import numpy as np
import pandas as pd
import seaborn as sns
pd.plotting.register_matplotlib_converters()
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')
# random state
SEED=100
np.random.seed(SEED)
[(x.__name__,x.__version__) for x in [np,pd,sns]]
[('numpy', '1.18.1'), ('pandas', '1.0.1'), ('seaborn', '0.9.0')]
pd.options.display.max_columns = None
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
import datetime
import functools
from sklearn.model_selection import train_test_split
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.tools as tls
from plotly.subplots import make_subplots
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)
[(x.__name__,x.__version__) for x in [plotly]]
[('plotly', '4.5.2')]
!ls ../data/raw
LCDataDictionary.xlsx loan_data_2007_2014.csv loan_data_2015.csv
!du -sh ../data/raw/loan_data_2007_2014.csv
229M ../data/raw/loan_data_2007_2014.csv
dat_raw = '../data/raw/'
dat_pro = '../data/processed/'
df = pd.read_csv(dat_raw + 'loan_data_2007_2014.csv',low_memory=False)
print(df.shape)
df.head(2).append(df.tail(2))
(466285, 75)
| Unnamed: 0 | id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | emp_title | emp_length | home_ownership | annual_inc | verification_status | issue_d | loan_status | pymnt_plan | url | desc | purpose | title | zip_code | addr_state | dti | delinq_2yrs | earliest_cr_line | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | out_prncp | out_prncp_inv | total_pymnt | total_pymnt_inv | total_rec_prncp | total_rec_int | total_rec_late_fee | recoveries | collection_recovery_fee | last_pymnt_d | last_pymnt_amnt | next_pymnt_d | last_credit_pull_d | collections_12_mths_ex_med | mths_since_last_major_derog | policy_code | application_type | annual_inc_joint | dti_joint | verification_status_joint | acc_now_delinq | tot_coll_amt | tot_cur_bal | open_acc_6m | open_il_6m | open_il_12m | open_il_24m | mths_since_rcnt_il | total_bal_il | il_util | open_rv_12m | open_rv_24m | max_bal_bc | all_util | total_rev_hi_lim | inq_fi | total_cu_tl | inq_last_12m | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1077501 | 1296599 | 5000 | 5000 | 4975.0 | 36 months | 10.65 | 162.87 | B | B2 | NaN | 10+ years | RENT | 24000.0 | Verified | Dec-11 | Fully Paid | n | https://www.lendingclub.com/browse/loanDetail.... | Borrower added on 12/22/11 > I need to upgra... | credit_card | Computer | 860xx | AZ | 27.65 | 0.0 | Jan-85 | 1.0 | NaN | NaN | 3.0 | 0.0 | 13648 | 83.7 | 9.0 | f | 0.00 | 0.00 | 5861.071414 | 5831.78 | 5000.00 | 861.07 | 0.0 | 0.00 | 0.00 | Jan-15 | 171.62 | NaN | Jan-16 | 0.0 | NaN | 1 | INDIVIDUAL | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 1 | 1077430 | 1314167 | 2500 | 2500 | 2500.0 | 60 months | 15.27 | 59.83 | C | C4 | Ryder | < 1 year | RENT | 30000.0 | Source Verified | Dec-11 | Charged Off | n | https://www.lendingclub.com/browse/loanDetail.... | Borrower added on 12/22/11 > I plan to use t... | car | bike | 309xx | GA | 1.00 | 0.0 | Apr-99 | 5.0 | NaN | NaN | 3.0 | 0.0 | 1687 | 9.4 | 4.0 | f | 0.00 | 0.00 | 1008.710000 | 1008.71 | 456.46 | 435.17 | 0.0 | 117.08 | 1.11 | Apr-13 | 119.66 | NaN | Sep-13 | 0.0 | NaN | 1 | INDIVIDUAL | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 466283 | 466283 | 9604874 | 11457002 | 2000 | 2000 | 2000.0 | 36 months | 7.90 | 62.59 | A | A4 | Server Engineer Lead | 3 years | OWN | 83000.0 | Verified | Jan-14 | Fully Paid | n | https://www.lendingclub.com/browse/loanDetail.... | NaN | credit_card | Credit card refinancing | 913xx | CA | 5.39 | 3.0 | Feb-03 | 1.0 | 13.0 | NaN | 21.0 | 0.0 | 11404 | 21.5 | 27.0 | w | 0.00 | 0.00 | 2126.579838 | 2126.58 | 2000.00 | 126.58 | 0.0 | 0.00 | 0.00 | Dec-14 | 1500.68 | NaN | Apr-15 | 0.0 | NaN | 1 | INDIVIDUAL | NaN | NaN | NaN | 0.0 | 0.0 | 591610.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 53100.0 | NaN | NaN | NaN |
| 466284 | 466284 | 9199665 | 11061576 | 10000 | 10000 | 9975.0 | 36 months | 19.20 | 367.58 | D | D3 | NaN | 10+ years | MORTGAGE | 46000.0 | Verified | Jan-14 | Current | n | https://www.lendingclub.com/browse/loanDetail.... | Borrower added on 12/04/13 > I will like a l... | other | Other | 950xx | CA | 22.78 | 1.0 | Feb-00 | 0.0 | 9.0 | NaN | 6.0 | 0.0 | 11325 | 70.8 | 22.0 | f | 3984.38 | 3974.41 | 8821.620000 | 8799.57 | 6015.62 | 2806.00 | 0.0 | 0.00 | 0.00 | Jan-16 | 367.58 | Feb-16 | Jan-16 | 0.0 | NaN | 1 | INDIVIDUAL | NaN | NaN | NaN | 0.0 | 0.0 | 57477.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16000.0 | NaN | NaN | NaN |
df_copy = df.copy(deep=True)
# df = df_copy.copy()
# downcast 64-bit numeric columns to 32-bit to halve their memory use
col_floats = df.select_dtypes(np.float64).columns
for col in col_floats:
    df[col] = df[col].astype(np.float32)

col_ints = df.select_dtypes(np.int64).columns
for col in col_ints:
    df[col] = df[col].astype(np.int32)
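# rough sanity check on the downcast (halving the 64-bit columns roughly
# halves their numeric footprint); deep=True also counts object columns
print('{:,.0f} MB'.format(df.memory_usage(deep=True).sum() / 1024**2))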
df.select_dtypes('object').head(1).T
| 0 | |
|---|---|
| term | 36 months |
| grade | B |
| sub_grade | B2 |
| emp_title | NaN |
| emp_length | 10+ years |
| home_ownership | RENT |
| verification_status | Verified |
| issue_d | Dec-11 |
| loan_status | Fully Paid |
| pymnt_plan | n |
| url | https://www.lendingclub.com/browse/loanDetail.... |
| desc | Borrower added on 12/22/11 > I need to upgra... |
| purpose | credit_card |
| title | Computer |
| zip_code | 860xx |
| addr_state | AZ |
| earliest_cr_line | Jan-85 |
| initial_list_status | f |
| last_pymnt_d | Jan-15 |
| next_pymnt_d | NaN |
| last_credit_pull_d | Jan-16 |
| application_type | INDIVIDUAL |
cols_date = ['issue_d',
'earliest_cr_line',
'last_pymnt_d',
'last_credit_pull_d']
for col in cols_date:
df[col+'_date'] = pd.to_datetime(df[col],
format='%b-%y',
errors='coerce')
df[cols_date].dtypes
issue_d               object
earliest_cr_line      object
last_pymnt_d          object
last_credit_pull_d    object
dtype: object
df[cols_date].head(2)
| issue_d | earliest_cr_line | last_pymnt_d | last_credit_pull_d | |
|---|---|---|---|---|
| 0 | Dec-11 | Jan-85 | Jan-15 | Jan-16 |
| 1 | Dec-11 | Apr-99 | Apr-13 | Sep-13 |
today = datetime.datetime.today()
today
datetime.datetime(2020, 3, 31, 16, 6, 21, 636929)
# freeze `today` so the months-since features below are reproducible
today = datetime.datetime(2020, 3, 30)
today
datetime.datetime(2020, 3, 30, 0, 0)
df['earliest_cr_line_date'].head(2)
0   1985-01-01
1   1999-04-01
Name: earliest_cr_line_date, dtype: datetime64[ns]
df['earliest_cr_line_date'].isnull().sum()
29
for col in cols_date:
df['mths_since_'+col] = \
round((today - df[col+'_date']) /
np.timedelta64(1,'M'))
cols_months_since = ['mths_since_' + i for i in cols_date]
df[cols_months_since].describe().round(2).T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| mths_since_issue_d | 466285.0 | 79.26 | 14.34 | 64.0 | 69.0 | 75.0 | 85.0 | 154.0 |
| mths_since_earliest_cr_line | 466256.0 | 267.48 | 93.97 | -584.0 | 211.0 | 253.0 | 313.0 | 615.0 |
| mths_since_last_pymnt_d | 465909.0 | 59.29 | 12.81 | 51.0 | 51.0 | 52.0 | 63.0 | 148.0 |
| mths_since_last_credit_pull_d | 466243.0 | 55.05 | 9.64 | 51.0 | 51.0 | 51.0 | 53.0 | 155.0 |
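# The negative minimum of mths_since_earliest_cr_line is a parsing artifact:
# with format '%b-%y', two-digit years 00-68 are read as 20xx, so e.g.
# 'Jan-62' becomes 2062 and (today - date) goes negative.
# A minimal fix sketch, assuming any credit line "opened in the future"
# really belongs to the previous century:
mask = df['earliest_cr_line_date'] > today
df.loc[mask, 'earliest_cr_line_date'] = \
    df.loc[mask, 'earliest_cr_line_date'] - pd.DateOffset(years=100)
df['mths_since_earliest_cr_line'] = \
    round((today - df['earliest_cr_line_date']) /
          np.timedelta64(1,'M'))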
df.select_dtypes('object').nunique().sort_values()
application_type            1
initial_list_status         2
pymnt_plan                  2
term                        2
verification_status         3
home_ownership              6
grade                       7
loan_status                 9
emp_length                 11
purpose                    14
sub_grade                  35
addr_state                 50
issue_d                    91
last_pymnt_d               98
next_pymnt_d              100
last_credit_pull_d        103
earliest_cr_line          664
zip_code                  888
title                   63099
desc                   124436
emp_title              205475
url                    466285
dtype: int64
cols_cat_small = df.select_dtypes('object').nunique()[lambda x: x<12].index
for col in cols_cat_small:
print(col)
print(df[col].unique())
print()
term
[' 36 months' ' 60 months']

grade
['B' 'C' 'A' 'E' 'F' 'D' 'G']

emp_length
['10+ years' '< 1 year' '1 year' '3 years' '8 years' '9 years' '4 years'
 '5 years' '6 years' '2 years' '7 years' nan]

home_ownership
['RENT' 'OWN' 'MORTGAGE' 'OTHER' 'NONE' 'ANY']

verification_status
['Verified' 'Source Verified' 'Not Verified']

loan_status
['Fully Paid' 'Charged Off' 'Current' 'Default' 'Late (31-120 days)'
 'In Grace Period' 'Late (16-30 days)'
 'Does not meet the credit policy. Status:Fully Paid'
 'Does not meet the credit policy. Status:Charged Off']

pymnt_plan
['n' 'y']

initial_list_status
['f' 'w']

application_type
['INDIVIDUAL']
df['term'].value_counts(dropna=False)
36 months    337953
60 months    128332
Name: term, dtype: int64
df['term_int'] = df['term'].str.extract(r'(\d+)').astype(int)
df['term_int'].unique()
array([36, 60])
df['emp_length'].value_counts(dropna=False)
10+ years    150049
2 years       41373
3 years       36596
< 1 year      36265
5 years       30774
1 year        29622
4 years       28023
7 years       26180
6 years       26112
8 years       22395
NaN           21008
9 years       17888
Name: emp_length, dtype: int64
# map '< 1 year' to 0 before extracting the digits,
# otherwise it gets lumped together with '1 year'
df['emp_length_int'] = df['emp_length']\
    .str.replace('< 1 year','0')\
    .str.extract(r'(\d+)').astype(np.float32)
df['emp_length_int'].value_counts(dropna=False)
10.0    150049
2.0      41373
3.0      36596
0.0      36265
5.0      30774
1.0      29622
4.0      28023
7.0      26180
6.0      26112
8.0      22395
NaN      21008
9.0      17888
Name: emp_length_int, dtype: int64
df['loan_income_ratio']= df['loan_amnt']/df['annual_inc']
df.iloc[:2,-2:]
| emp_length_int | loan_income_ratio | |
|---|---|---|
| 0 | 10.0 | 0.208333 |
| 1 | 1.0 | 0.083333 |
df['loan_status'].value_counts()
Current                                                224226
Fully Paid                                             184739
Charged Off                                             42475
Late (31-120 days)                                       6900
In Grace Period                                          3146
Does not meet the credit policy. Status:Fully Paid       1988
Late (16-30 days)                                        1218
Default                                                   832
Does not meet the credit policy. Status:Charged Off       761
Name: loan_status, dtype: int64
df['loan_status'].unique()
array(['Fully Paid', 'Charged Off', 'Current', 'Default',
'Late (31-120 days)', 'In Grace Period', 'Late (16-30 days)',
'Does not meet the credit policy. Status:Fully Paid',
'Does not meet the credit policy. Status:Charged Off'],
dtype=object)
cols_bad = ['Charged Off',
'Default',
'Does not meet the credit policy. Status:Charged Off',
'Late (31-120 days)']
df['good_bad'] = np.where(df['loan_status'].isin(cols_bad),0,1)
df['good_bad'].value_counts()
1    415317
0     50968
Name: good_bad, dtype: int64
pd.options.display.max_rows = 200
df.head(2)
| Unnamed: 0 | id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | emp_title | emp_length | home_ownership | annual_inc | verification_status | issue_d | loan_status | pymnt_plan | url | desc | purpose | title | zip_code | addr_state | dti | delinq_2yrs | earliest_cr_line | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | out_prncp | out_prncp_inv | total_pymnt | total_pymnt_inv | total_rec_prncp | total_rec_int | total_rec_late_fee | recoveries | collection_recovery_fee | last_pymnt_d | last_pymnt_amnt | next_pymnt_d | last_credit_pull_d | collections_12_mths_ex_med | mths_since_last_major_derog | policy_code | application_type | annual_inc_joint | dti_joint | verification_status_joint | acc_now_delinq | tot_coll_amt | tot_cur_bal | open_acc_6m | open_il_6m | open_il_12m | open_il_24m | mths_since_rcnt_il | total_bal_il | il_util | open_rv_12m | open_rv_24m | max_bal_bc | all_util | total_rev_hi_lim | inq_fi | total_cu_tl | inq_last_12m | issue_d_date | earliest_cr_line_date | last_pymnt_d_date | last_credit_pull_d_date | mths_since_issue_d | mths_since_earliest_cr_line | mths_since_last_pymnt_d | mths_since_last_credit_pull_d | term_int | emp_length_int | loan_income_ratio | good_bad | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1077501 | 1296599 | 5000 | 5000 | 4975.0 | 36 months | 10.65 | 162.869995 | B | B2 | NaN | 10+ years | RENT | 24000.0 | Verified | Dec-11 | Fully Paid | n | https://www.lendingclub.com/browse/loanDetail.... | Borrower added on 12/22/11 > I need to upgra... | credit_card | Computer | 860xx | AZ | 27.65 | 0.0 | Jan-85 | 1.0 | NaN | NaN | 3.0 | 0.0 | 13648 | 83.699997 | 9.0 | f | 0.0 | 0.0 | 5861.071289 | 5831.779785 | 5000.000000 | 861.070007 | 0.0 | 0.000000 | 0.00 | Jan-15 | 171.619995 | NaN | Jan-16 | 0.0 | NaN | 1 | INDIVIDUAL | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2011-12-01 | 1985-01-01 | 2015-01-01 | 2016-01-01 | 100.0 | 423.0 | 63.0 | 51.0 | 36 | 10.0 | 0.208333 | 1 |
| 1 | 1 | 1077430 | 1314167 | 2500 | 2500 | 2500.0 | 60 months | 15.27 | 59.830002 | C | C4 | Ryder | < 1 year | RENT | 30000.0 | Source Verified | Dec-11 | Charged Off | n | https://www.lendingclub.com/browse/loanDetail.... | Borrower added on 12/22/11 > I plan to use t... | car | bike | 309xx | GA | 1.00 | 0.0 | Apr-99 | 5.0 | NaN | NaN | 3.0 | 0.0 | 1687 | 9.400000 | 4.0 | f | 0.0 | 0.0 | 1008.710022 | 1008.710022 | 456.459991 | 435.170013 | 0.0 | 117.080002 | 1.11 | Apr-13 | 119.660004 | NaN | Sep-13 | 0.0 | NaN | 1 | INDIVIDUAL | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2011-12-01 | 1999-04-01 | 2013-04-01 | 2013-09-01 | 100.0 | 252.0 | 84.0 | 79.0 | 60 | 1.0 | 0.083333 | 0 |
missing_cols = df.isnull().sum()[lambda x: x>0].sort_values()\
.index.tolist()
df[missing_cols].head(2)
| loan_income_ratio | annual_inc | title | earliest_cr_line_date | total_acc | pub_rec | mths_since_earliest_cr_line | open_acc | earliest_cr_line | delinq_2yrs | inq_last_6mths | acc_now_delinq | mths_since_last_credit_pull_d | last_credit_pull_d_date | last_credit_pull_d | collections_12_mths_ex_med | revol_util | mths_since_last_pymnt_d | last_pymnt_d | last_pymnt_d_date | emp_length_int | emp_length | emp_title | tot_coll_amt | tot_cur_bal | total_rev_hi_lim | next_pymnt_d | mths_since_last_delinq | desc | mths_since_last_major_derog | mths_since_last_record | annual_inc_joint | inq_last_12m | total_cu_tl | inq_fi | dti_joint | max_bal_bc | open_rv_24m | open_rv_12m | il_util | total_bal_il | mths_since_rcnt_il | open_il_24m | open_il_12m | open_il_6m | open_acc_6m | verification_status_joint | all_util | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.208333 | 24000.0 | Computer | 1985-01-01 | 9.0 | 0.0 | 423.0 | 3.0 | Jan-85 | 0.0 | 1.0 | 0.0 | 51.0 | 2016-01-01 | Jan-16 | 0.0 | 83.699997 | 63.0 | Jan-15 | 2015-01-01 | 10.0 | 10+ years | NaN | NaN | NaN | NaN | NaN | NaN | Borrower added on 12/22/11 > I need to upgra... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 0.083333 | 30000.0 | bike | 1999-04-01 | 4.0 | 0.0 | 252.0 | 3.0 | Apr-99 | 0.0 | 5.0 | 0.0 | 79.0 | 2013-09-01 | Sep-13 | 0.0 | 9.400000 | 84.0 | Apr-13 | 2013-04-01 | 1.0 | < 1 year | Ryder | NaN | NaN | NaN | NaN | NaN | Borrower added on 12/22/11 > I plan to use t... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# df[missing_cols].isnull().sum()
df.shape
(466285, 87)
# drop columns that are entirely NaN
df = df.dropna(how='all',axis=1)
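# 17 columns (the all-NaN joint-application and installment fields)
# are gone: 87 -> 70
print(df.shape)
(466285, 70)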
df.isnull().sum()[lambda x: x>0]
emp_title                       27588
emp_length                      21008
annual_inc                          4
desc                           340302
title                              20
delinq_2yrs                        29
earliest_cr_line                   29
inq_last_6mths                     29
mths_since_last_delinq         250351
mths_since_last_record         403647
open_acc                           29
pub_rec                            29
revol_util                        340
total_acc                          29
last_pymnt_d                      376
next_pymnt_d                   227214
last_credit_pull_d                 42
collections_12_mths_ex_med        145
mths_since_last_major_derog    367311
acc_now_delinq                     29
tot_coll_amt                    70276
tot_cur_bal                     70276
total_rev_hi_lim                70276
earliest_cr_line_date              29
last_pymnt_d_date                 376
last_credit_pull_d_date            42
mths_since_earliest_cr_line        29
mths_since_last_pymnt_d           376
mths_since_last_credit_pull_d      42
emp_length_int                  21008
loan_income_ratio                   4
dtype: int64
missing_cols = df.isnull().sum()[lambda x: x>0].sort_values()\
.index.tolist()
df[missing_cols].head(2)
| loan_income_ratio | annual_inc | title | acc_now_delinq | total_acc | pub_rec | mths_since_earliest_cr_line | open_acc | earliest_cr_line | delinq_2yrs | inq_last_6mths | earliest_cr_line_date | mths_since_last_credit_pull_d | last_credit_pull_d | last_credit_pull_d_date | collections_12_mths_ex_med | revol_util | mths_since_last_pymnt_d | last_pymnt_d_date | last_pymnt_d | emp_length_int | emp_length | emp_title | tot_cur_bal | tot_coll_amt | total_rev_hi_lim | next_pymnt_d | mths_since_last_delinq | desc | mths_since_last_major_derog | mths_since_last_record | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.208333 | 24000.0 | Computer | 0.0 | 9.0 | 0.0 | 423.0 | 3.0 | Jan-85 | 0.0 | 1.0 | 1985-01-01 | 51.0 | Jan-16 | 2016-01-01 | 0.0 | 83.699997 | 63.0 | 2015-01-01 | Jan-15 | 10.0 | 10+ years | NaN | NaN | NaN | NaN | NaN | NaN | Borrower added on 12/22/11 > I need to upgra... | NaN | NaN |
| 1 | 0.083333 | 30000.0 | bike | 0.0 | 4.0 | 0.0 | 252.0 | 3.0 | Apr-99 | 0.0 | 5.0 | 1999-04-01 | 79.0 | Sep-13 | 2013-09-01 | 0.0 | 9.400000 | 84.0 | 2013-04-01 | Apr-13 | 1.0 | < 1 year | Ryder | NaN | NaN | NaN | NaN | NaN | Borrower added on 12/22/11 > I plan to use t... | NaN | NaN |
columns = ['feature','n_missing','top_value',
'proportion','nunique']
df_missing = pd.DataFrame(columns=columns)
for col in missing_cols:
x = df[col].value_counts(normalize=True).round(2)
unq = df[col].nunique()
n_missing = df[col].isnull().sum()
row = [col,n_missing, x.index[0],x.iloc[0],unq]
df_missing.loc[len(df_missing)] = row
df_missing = df_missing.set_index('feature')
df_missing
| n_missing | top_value | proportion | nunique | |
|---|---|---|---|---|
| feature | ||||
| loan_income_ratio | 4 | 0.2 | 0.02 | 77022 |
| annual_inc | 4 | 60000 | 0.04 | 31901 |
| title | 20 | Debt consolidation | 0.35 | 63099 |
| acc_now_delinq | 29 | 0 | 1.00 | 6 |
| total_acc | 29 | 21 | 0.04 | 112 |
| pub_rec | 29 | 0 | 0.87 | 26 |
| mths_since_earliest_cr_line | 29 | 234 | 0.01 | 664 |
| open_acc | 29 | 9 | 0.09 | 62 |
| earliest_cr_line | 29 | Oct-00 | 0.01 | 664 |
| delinq_2yrs | 29 | 0 | 0.82 | 24 |
| inq_last_6mths | 29 | 0 | 0.52 | 28 |
| earliest_cr_line_date | 29 | 2000-10-01 00:00:00 | 0.01 | 664 |
| mths_since_last_credit_pull_d | 42 | 51 | 0.70 | 103 |
| last_credit_pull_d | 42 | Jan-16 | 0.70 | 103 |
| last_credit_pull_d_date | 42 | 2016-01-01 00:00:00 | 0.70 | 103 |
| collections_12_mths_ex_med | 145 | 0 | 0.99 | 9 |
| revol_util | 340 | 0 | 0.00 | 1269 |
| mths_since_last_pymnt_d | 376 | 51 | 0.39 | 98 |
| last_pymnt_d_date | 376 | 2016-01-01 00:00:00 | 0.39 | 98 |
| last_pymnt_d | 376 | Jan-16 | 0.39 | 98 |
| emp_length_int | 21008 | 10 | 0.34 | 11 |
| emp_length | 21008 | 10+ years | 0.34 | 11 |
| emp_title | 27588 | Teacher | 0.01 | 205475 |
| tot_cur_bal | 70276 | 0 | 0.00 | 220690 |
| tot_coll_amt | 70276 | 0 | 0.87 | 6321 |
| total_rev_hi_lim | 70276 | 15000 | 0.00 | 14612 |
| next_pymnt_d | 227214 | Feb-16 | 0.87 | 100 |
| mths_since_last_delinq | 250351 | 9 | 0.02 | 145 |
| desc | 340302 |  | 0.00 | 124436 |
| mths_since_last_major_derog | 367311 | 45 | 0.02 | 162 |
| mths_since_last_record | 403647 | 0 | 0.02 | 123 |
df['annual_inc'].isna().sum()
4
# impute annual_inc with its mean and total_rev_hi_lim with funded_amnt
df['annual_inc'] = df['annual_inc'].fillna(df['annual_inc'].mean())
df['total_rev_hi_lim'] = df['total_rev_hi_lim'].fillna(df['funded_amnt'])
cols_zero = ['mths_since_earliest_cr_line','acc_now_delinq',
'total_acc','pub_rec','open_acc','inq_last_6mths',
'delinq_2yrs','emp_length_int']
for col in cols_zero:
df[col] = df[col].fillna(0)
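# these are count-type features, so a missing value plausibly means
# "none recorded" and 0 is a natural fill; quick check:
assert df[cols_zero].isnull().sum().sum() == 0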
df.select_dtypes('object').nunique().sort_values()
application_type            1
initial_list_status         2
pymnt_plan                  2
term                        2
verification_status         3
home_ownership              6
grade                       7
loan_status                 9
emp_length                 11
purpose                    14
sub_grade                  35
addr_state                 50
issue_d                    91
last_pymnt_d               98
next_pymnt_d              100
last_credit_pull_d        103
earliest_cr_line          664
zip_code                  888
title                   63099
desc                   124436
emp_title              205475
url                    466285
dtype: int64
pd.get_dummies(df['grade'],prefix='grade',prefix_sep=':').head(2)
| grade:A | grade:B | grade:C | grade:D | grade:E | grade:F | grade:G | |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
cols_dummy = ['grade','sub_grade','home_ownership',
'verification_status','loan_status',
'purpose','addr_state','initial_list_status']
df_dummies = [pd.get_dummies(df[col],
prefix=col,prefix_sep = ':')
for col in cols_dummy]
df_dummies = pd.concat(df_dummies,axis=1)
print(df_dummies.shape)
df_dummies.head(2)
(466285, 126)
| grade:A | grade:B | grade:C | grade:D | grade:E | grade:F | grade:G | sub_grade:A1 | sub_grade:A2 | sub_grade:A3 | sub_grade:A4 | sub_grade:A5 | sub_grade:B1 | sub_grade:B2 | sub_grade:B3 | sub_grade:B4 | sub_grade:B5 | sub_grade:C1 | sub_grade:C2 | sub_grade:C3 | sub_grade:C4 | sub_grade:C5 | sub_grade:D1 | sub_grade:D2 | sub_grade:D3 | sub_grade:D4 | sub_grade:D5 | sub_grade:E1 | sub_grade:E2 | sub_grade:E3 | sub_grade:E4 | sub_grade:E5 | sub_grade:F1 | sub_grade:F2 | sub_grade:F3 | sub_grade:F4 | sub_grade:F5 | sub_grade:G1 | sub_grade:G2 | sub_grade:G3 | sub_grade:G4 | sub_grade:G5 | home_ownership:ANY | home_ownership:MORTGAGE | home_ownership:NONE | home_ownership:OTHER | home_ownership:OWN | home_ownership:RENT | verification_status:Not Verified | verification_status:Source Verified | verification_status:Verified | loan_status:Charged Off | loan_status:Current | loan_status:Default | loan_status:Does not meet the credit policy. Status:Charged Off | loan_status:Does not meet the credit policy. Status:Fully Paid | loan_status:Fully Paid | loan_status:In Grace Period | loan_status:Late (16-30 days) | loan_status:Late (31-120 days) | purpose:car | purpose:credit_card | purpose:debt_consolidation | purpose:educational | purpose:home_improvement | purpose:house | purpose:major_purchase | purpose:medical | purpose:moving | purpose:other | purpose:renewable_energy | purpose:small_business | purpose:vacation | purpose:wedding | addr_state:AK | addr_state:AL | addr_state:AR | addr_state:AZ | addr_state:CA | addr_state:CO | addr_state:CT | addr_state:DC | addr_state:DE | addr_state:FL | addr_state:GA | addr_state:HI | addr_state:IA | addr_state:ID | addr_state:IL | addr_state:IN | addr_state:KS | addr_state:KY | addr_state:LA | addr_state:MA | addr_state:MD | addr_state:ME | addr_state:MI | addr_state:MN | addr_state:MO | addr_state:MS | addr_state:MT | addr_state:NC | addr_state:NE | addr_state:NH | addr_state:NJ | addr_state:NM | addr_state:NV | addr_state:NY | addr_state:OH | addr_state:OK | addr_state:OR | addr_state:PA | addr_state:RI | addr_state:SC | addr_state:SD | addr_state:TN | addr_state:TX | addr_state:UT | addr_state:VA | addr_state:VT | addr_state:WA | addr_state:WI | addr_state:WV | addr_state:WY | initial_list_status:f | initial_list_status:w | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
df = pd.concat([df,df_dummies],axis=1)
print(df.shape)
df.head(2)
(466285, 196)
| Unnamed: 0 | id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | emp_title | emp_length | home_ownership | annual_inc | verification_status | issue_d | loan_status | pymnt_plan | url | desc | purpose | title | zip_code | addr_state | dti | delinq_2yrs | earliest_cr_line | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | out_prncp | out_prncp_inv | total_pymnt | total_pymnt_inv | total_rec_prncp | total_rec_int | total_rec_late_fee | recoveries | collection_recovery_fee | last_pymnt_d | last_pymnt_amnt | next_pymnt_d | last_credit_pull_d | collections_12_mths_ex_med | mths_since_last_major_derog | policy_code | application_type | acc_now_delinq | tot_coll_amt | tot_cur_bal | total_rev_hi_lim | issue_d_date | earliest_cr_line_date | last_pymnt_d_date | last_credit_pull_d_date | mths_since_issue_d | mths_since_earliest_cr_line | mths_since_last_pymnt_d | mths_since_last_credit_pull_d | term_int | emp_length_int | loan_income_ratio | good_bad | grade:A | grade:B | grade:C | grade:D | grade:E | grade:F | grade:G | sub_grade:A1 | sub_grade:A2 | sub_grade:A3 | sub_grade:A4 | sub_grade:A5 | sub_grade:B1 | sub_grade:B2 | sub_grade:B3 | sub_grade:B4 | sub_grade:B5 | sub_grade:C1 | sub_grade:C2 | sub_grade:C3 | sub_grade:C4 | sub_grade:C5 | sub_grade:D1 | sub_grade:D2 | sub_grade:D3 | sub_grade:D4 | sub_grade:D5 | sub_grade:E1 | sub_grade:E2 | sub_grade:E3 | sub_grade:E4 | sub_grade:E5 | sub_grade:F1 | sub_grade:F2 | sub_grade:F3 | sub_grade:F4 | sub_grade:F5 | sub_grade:G1 | sub_grade:G2 | sub_grade:G3 | sub_grade:G4 | sub_grade:G5 | home_ownership:ANY | home_ownership:MORTGAGE | home_ownership:NONE | home_ownership:OTHER | home_ownership:OWN | home_ownership:RENT | verification_status:Not Verified | verification_status:Source Verified | verification_status:Verified | loan_status:Charged Off | loan_status:Current | loan_status:Default | loan_status:Does not meet the credit policy. Status:Charged Off | loan_status:Does not meet the credit policy. Status:Fully Paid | loan_status:Fully Paid | loan_status:In Grace Period | loan_status:Late (16-30 days) | loan_status:Late (31-120 days) | purpose:car | purpose:credit_card | purpose:debt_consolidation | purpose:educational | purpose:home_improvement | purpose:house | purpose:major_purchase | purpose:medical | purpose:moving | purpose:other | purpose:renewable_energy | purpose:small_business | purpose:vacation | purpose:wedding | addr_state:AK | addr_state:AL | addr_state:AR | addr_state:AZ | addr_state:CA | addr_state:CO | addr_state:CT | addr_state:DC | addr_state:DE | addr_state:FL | addr_state:GA | addr_state:HI | addr_state:IA | addr_state:ID | addr_state:IL | addr_state:IN | addr_state:KS | addr_state:KY | addr_state:LA | addr_state:MA | addr_state:MD | addr_state:ME | addr_state:MI | addr_state:MN | addr_state:MO | addr_state:MS | addr_state:MT | addr_state:NC | addr_state:NE | addr_state:NH | addr_state:NJ | addr_state:NM | addr_state:NV | addr_state:NY | addr_state:OH | addr_state:OK | addr_state:OR | addr_state:PA | addr_state:RI | addr_state:SC | addr_state:SD | addr_state:TN | addr_state:TX | addr_state:UT | addr_state:VA | addr_state:VT | addr_state:WA | addr_state:WI | addr_state:WV | addr_state:WY | initial_list_status:f | initial_list_status:w | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1077501 | 1296599 | 5000 | 5000 | 4975.0 | 36 months | 10.65 | 162.869995 | B | B2 | NaN | 10+ years | RENT | 24000.0 | Verified | Dec-11 | Fully Paid | n | https://www.lendingclub.com/browse/loanDetail.... | Borrower added on 12/22/11 > I need to upgra... | credit_card | Computer | 860xx | AZ | 27.65 | 0.0 | Jan-85 | 1.0 | NaN | NaN | 3.0 | 0.0 | 13648 | 83.699997 | 9.0 | f | 0.0 | 0.0 | 5861.071289 | 5831.779785 | 5000.000000 | 861.070007 | 0.0 | 0.000000 | 0.00 | Jan-15 | 171.619995 | NaN | Jan-16 | 0.0 | NaN | 1 | INDIVIDUAL | 0.0 | NaN | NaN | 5000.0 | 2011-12-01 | 1985-01-01 | 2015-01-01 | 2016-01-01 | 100.0 | 423.0 | 63.0 | 51.0 | 36 | 10.0 | 0.208333 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 1 | 1 | 1077430 | 1314167 | 2500 | 2500 | 2500.0 | 60 months | 15.27 | 59.830002 | C | C4 | Ryder | < 1 year | RENT | 30000.0 | Source Verified | Dec-11 | Charged Off | n | https://www.lendingclub.com/browse/loanDetail.... | Borrower added on 12/22/11 > I plan to use t... | car | bike | 309xx | GA | 1.00 | 0.0 | Apr-99 | 5.0 | NaN | NaN | 3.0 | 0.0 | 1687 | 9.400000 | 4.0 | f | 0.0 | 0.0 | 1008.710022 | 1008.710022 | 456.459991 | 435.170013 | 0.0 | 117.080002 | 1.11 | Apr-13 | 119.660004 | NaN | Sep-13 | 0.0 | NaN | 1 | INDIVIDUAL | 0.0 | NaN | NaN | 2500.0 | 2011-12-01 | 1999-04-01 | 2013-04-01 | 2013-09-01 | 100.0 | 252.0 | 84.0 | 79.0 | 60 | 1.0 | 0.083333 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
df.isnull().sum().sum()
1779794
from sklearn.model_selection import train_test_split
df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split(
df.drop('good_bad', axis = 1),
df['good_bad'],
train_size=0.8,
random_state=SEED
)
df_Xtrain.shape, df_Xtest.shape, ser_ytrain.shape, ser_ytest.shape
((373028, 195), (93257, 195), (373028,), (93257,))
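# With only ~11% bad loans, a stratified split would keep the class ratio
# identical in train and test; a variant of the call above (assumed here,
# not what was run):
# df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split(
#     df.drop('good_bad', axis=1), df['good_bad'],
#     train_size=0.8, random_state=SEED, stratify=df['good_bad'])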
X_prep = df_Xtrain
y_prep = ser_ytrain
# In the second round, we comment out the train part above and
# uncomment this test part.
#
# Note: after doing the test part, comment the test part back out
# and keep the train part uncommented, since we create features
# based on train, not test.
#
# X_prep = df_Xtest
# y_prep = ser_ytest
def get_woe_iv(df1,
ser_target,
feature_cat,
target_name='good_bad',
sort='woe',
return_all=False):
"""Calculate Weight of Evidence and Infomation Value.
Parameters
-----------
df1: pandas.DataFrame
Input dataframe having categorical feature.
ser_target: pandas.Series
Pandas Series of binary values. e.g y_prep
feature_cat: str
Name of categorical column
target_name: str
Name of target column.
sort: str
Sort the output dataframe by this column.
Default is 'woe'. We use 'woe' for discrete
columns.
If the column values are continuous, then
use the actual column name for sorting.
return_all: boolean, optional
Whether or not return all columns
Returns:
--------
Returns dataframe with WOE, IV and other quantities.
Example:
--------
df_woe = get_woe_iv(X_prep, y_prep, 'grade')
"""
df1 = pd.DataFrame({
feature_cat: df1[feature_cat],
target_name: ser_target
})
df1 = df1.groupby(feature_cat).agg(
n_obs=(target_name,'count'),
prop_good=(target_name,'mean'))
df1['prop_n_obs'] = df1['n_obs'] / df1['n_obs'].sum()
df1['n_good'] = df1['prop_good'] * df1['n_obs']
df1['n_bad'] = (1-df1['prop_good']) * df1['n_obs']
df1['prop_n_good'] = df1['n_good'] / df1['n_good'].sum()
df1['prop_n_bad'] = df1['n_bad'] / df1['n_bad'].sum()
df1['woe'] = np.log(df1['prop_n_good']/df1['prop_n_bad'])
df1['diff_prop_good'] = df1['prop_good'].diff().abs()
df1['diff_woe'] = df1['woe'].diff().abs()
df1['diff_n_obs'] = df1['n_obs'].diff(-1).abs()
df1['iv'] = ((df1['prop_n_good']-df1['prop_n_bad']
) * df1['woe']
).sum()
# reorder columns
if not return_all:
df1 = df1[['n_obs', 'woe', 'diff_n_obs' ]]
if return_all:
df1 = df1[['n_obs', 'woe', 'diff_n_obs',
'prop_n_obs','iv',
'n_good', 'n_bad',
'prop_n_good','prop_n_bad',
'prop_good',
'diff_prop_good', 'diff_woe' ]]
# add one more column
n_obs1 = df1['n_obs'].apply(lambda x: "{:,d}".format(x)).astype(str)
df1['text_plotly'] = (
'n_obs: ' + n_obs1 + '<br>' +
'WoE: ' + df1['woe'].round(4).astype(str) + '<br>'
)
# sort values
df1 = df1.sort_values(sort)
df1 = df1.reset_index()
return df1
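# Quick hand-check of the definitions on toy data (values are illustrative
# only): woe_i = ln(prop_n_good_i / prop_n_bad_i) and
# iv = sum_i (prop_n_good_i - prop_n_bad_i) * woe_i
toy = pd.DataFrame({'cat': ['a','a','a','b','b']})
ser_toy = pd.Series([1,1,0,1,0])
# 'a' holds 2/3 of all goods and 1/2 of all bads => woe = ln((2/3)/(1/2))
get_woe_iv(toy, ser_toy, 'cat')[['cat','woe']]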
df_woe = get_woe_iv(X_prep,y_prep,'grade')
df_woe.head().style\
.background_gradient(subset=['n_obs'],cmap='Blues')\
.background_gradient(subset=['woe'],cmap='Reds')\
.format({'n_obs':"{:,d}", 'diff_n_obs':"{:,.0f}"})
| grade | n_obs | woe | diff_n_obs | text_plotly | |
|---|---|---|---|---|---|
| 0 | G | 703 | -1.258162 | nan | n_obs: 703 WoE: -1.2582 |
| 1 | F | 2,641 | -0.922705 | 1,938 | n_obs: 2,641 WoE: -0.9227 |
| 2 | E | 7,084 | -0.692380 | 4,443 | n_obs: 7,084 WoE: -0.6924 |
| 3 | D | 15,294 | -0.396822 | 8,210 | n_obs: 15,294 WoE: -0.3968 |
| 4 | C | 24,982 | -0.054636 | 9,688 | n_obs: 24,982 WoE: -0.0546 |
def plot_woe(df_woe,xrot=0,figsize=(18,12),fontsize=20,color='red'):
x = df_woe.iloc[:,0].to_numpy().astype(str)
y = df_woe['woe']
fig,ax = plt.subplots(1,1,figsize=figsize)
plt.plot(x,y,marker='o',linestyle='--',color='k')
plt.xlabel(df_woe.columns[0],fontsize=fontsize)
plt.ylabel('Weight of Evidence',fontsize=fontsize)
plt.xticks(rotation=xrot,fontsize=fontsize,color=color)
plt.yticks(fontsize=fontsize)
plt.grid(color='k',linestyle='-.')
for i,txt in enumerate(df_woe['n_obs']):
txt = ' {:,.0f}'.format(txt)
    try: # sometimes y[i] raises an error
        plt.annotate(txt, (x[i],y[i]),
                     rotation=90,fontsize=18,color='blue')
    except Exception:
        pass
def plot_woe_plotly(df_woe):
col = df_woe.columns.tolist()[0]
x = df_woe.iloc[:,0].astype(str)
y = df_woe['woe']
z = df_woe['text_plotly']
data = go.Scatter(x=x,y=y,text=z,mode='lines+markers')
fig = go.Figure(data=data)
    fig.update_layout(
        title=f"WoE Curve for <b>{col}</b>",
        xaxis_title=f"{col}",
        yaxis_title="Weight of Evidence",
    )
# also display dataframe
df_style = df_woe.style\
.background_gradient(subset=['n_obs'],
cmap='Blues')\
.background_gradient(subset=['woe'],
cmap='Reds')\
.format({'n_obs':"{:,d}",
'diff_n_obs':"{:,.0f}"})
display(df_style)
fig.show()
def create_bins_dummies_cont(X_prep,col,bins):
"""Creates new dummy columns in-place in given dataframe.
For these continuous bins, the left edge is inclusive:
each dummy covers the interval [a, b).
Example:
========
bins = [0,9.961,12.025, 15.74, 20.281]
col = 'int_rate'
create_bins_dummies_cont(X_prep,col,bins)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
Outputs:
========
int_rate:0_9.961
int_rate:9.961_12.025
int_rate:12.025_15.74
int_rate:15.74_20.281
int_rate:>=20.281
"""
nbins = len(bins)
ser = X_prep[col]
for i in range(nbins):
if i < (nbins-1):
a,b = bins[i], bins[i+1]
name = col + f':{a}_{b}'
print(name)
# condition
cond = (ser>=a) & (ser<b)
X_prep[name] = np.where(cond,1,0)
elif i == (nbins-1):
b = bins[i]
name = col + ':>=' + str(b)
print(name)
# condition
cond = ser >= b
X_prep[name] = np.where(cond,1,0)
def create_bins_dummies_int(X_prep,col,bins):
"""Creates new dummy columns in-place in given dataframe.
For these integer bins, both the left and the right
edge are included: each dummy covers [a, b].
Example:
---------
col = 'mths_since_last_delinq'
bins = [0,3,30,56]
create_bins_dummies_int(X_prep,col,bins)
Outputs:
--------
mths_since_last_delinq:0_3
mths_since_last_delinq:4_30
mths_since_last_delinq:31_56
mths_since_last_delinq:>=57
"""
nbins = len(bins)
ser = X_prep[col]
# the last bin edge must be an integer
assert isinstance(bins[-1],int)
for i in range(nbins):
if i == 0:
a,b = bins[i], bins[i+1]
name = col + f':{a}_{b}'
print(name)
# condition
cond = (ser>=a) & (ser<=b)
X_prep[name] = np.where(cond,1,0)
elif i < (nbins-1):
a,b = bins[i]+1, bins[i+1]
name = col + f':{a}_{b}'
print(name)
# condition
cond = (ser>=a) & (ser<=b)
X_prep[name] = np.where(cond,1,0)
elif i == (nbins-1):
b = bins[i]+1
name = col + ':>=' + str(b)
print(name)
# condition
cond = ser >= b
X_prep[name] = np.where(cond,1,0)
def create_dummies_left_inclusive(X_prep,col,bins,labels):
    """Creates new dummy columns in-place in given dataframe.
    Buckets are left-inclusive: the first label covers
    (-inf, bins[0]), label i covers [bins[i-1], bins[i]),
    and the last label covers [bins[-1], inf).
    len(labels) must be len(bins) + 1.
    """
    nlabels = len(labels)
    ser = X_prep[col]
    for i in range(nlabels):
        name = col + ':' + labels[i]
        if i == 0:
            cond = (ser < bins[i])
        elif i < (nlabels-1):
            cond = (ser >= bins[i-1]) & (ser < bins[i])
        else:
            cond = (ser >= bins[i-1])
        X_prep[name] = np.where(cond,1,0)
        print(name)
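# a hypothetical usage sketch (the 'dti' edges below are illustrative,
# not tuned from a WoE plot):
# create_dummies_left_inclusive(X_prep,'dti',
#     bins=[10,20,30],
#     labels=['<10','10_20','20_30','>=30'])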
col = 'grade'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique =  7
nulls =  0

193817    B
174039    C
Name: grade, dtype: object
df[col].value_counts().sort_index()
A     74867
B    136929
C    125293
D     76888
E     35757
F     13229
G      3322
Name: grade, dtype: int64
"""
as we expect, lower the grade, higher is the probabilty of default.
here, we will not group grades into grade:G_F and so on.
we already have one hot encoding of grade, so we are fine.
grade:A
grade:B
grade:C
grade:D
grade:E
grade:F
""";
col = 'home_ownership'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique =  5
nulls =  0

193817    RENT
174039     OWN
Name: home_ownership, dtype: object
cols = [i for i in X_prep.columns if 'home_ownership' in i]
X_prep[cols].head(2)
| home_ownership | home_ownership:ANY | home_ownership:MORTGAGE | home_ownership:NONE | home_ownership:OTHER | home_ownership:OWN | home_ownership:RENT | |
|---|---|---|---|---|---|---|---|
| 193817 | RENT | 0 | 0 | 0 | 0 | 0 | 1 |
| 174039 | OWN | 0 | 0 | 0 | 0 | 1 | 0 |
df_woe = get_woe_iv(X_prep,y_prep,'home_ownership')
plot_woe_plotly(df_woe)
| home_ownership | n_obs | woe | diff_n_obs | text_plotly | |
|---|---|---|---|---|---|
| 0 | OTHER | 37 | -0.241722 | 8,297 | n_obs: 37 WoE: -0.2417 |
| 1 | RENT | 37,331 | -0.158371 | nan | n_obs: 37,331 WoE: -0.1584 |
| 2 | OWN | 8,334 | -0.010046 | 28,997 | n_obs: 8,334 WoE: -0.01 |
| 3 | MORTGAGE | 47,544 | 0.141610 | 47,533 | n_obs: 47,544 WoE: 0.1416 |
| 4 | NONE | 11 | inf | 26 | n_obs: 11 WoE: inf |
"""
home_ownership:RENT_OTHER_NONE_ANY
home_ownership:OWN
home_ownership:MORTGAGE
""";
X_prep['home_ownership:RENT_OTHER_NONE_ANY'] = \
sum([X_prep['home_ownership:RENT'],
X_prep['home_ownership:OTHER'], X_prep['home_ownership:NONE'],
X_prep['home_ownership:ANY']])
col = 'addr_state'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique =  49
nulls =  0

193817    CA
174039    WA
Name: addr_state, dtype: object
cols = [i for i in X_prep.columns if 'addr_state' in i]
X_prep[cols].head(2)
| addr_state | addr_state:AK | addr_state:AL | addr_state:AR | addr_state:AZ | addr_state:CA | addr_state:CO | addr_state:CT | addr_state:DC | addr_state:DE | addr_state:FL | addr_state:GA | addr_state:HI | addr_state:IA | addr_state:ID | addr_state:IL | addr_state:IN | addr_state:KS | addr_state:KY | addr_state:LA | addr_state:MA | addr_state:MD | addr_state:ME | addr_state:MI | addr_state:MN | addr_state:MO | addr_state:MS | addr_state:MT | addr_state:NC | addr_state:NE | addr_state:NH | addr_state:NJ | addr_state:NM | addr_state:NV | addr_state:NY | addr_state:OH | addr_state:OK | addr_state:OR | addr_state:PA | addr_state:RI | addr_state:SC | addr_state:SD | addr_state:TN | addr_state:TX | addr_state:UT | addr_state:VA | addr_state:VT | addr_state:WA | addr_state:WI | addr_state:WV | addr_state:WY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 193817 | CA | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 174039 | WA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
X_prep['addr_state'].unique()
array(['CA', 'WA', 'CO', 'NY', 'PA', 'GA', 'TX', 'OK', 'KY', 'FL', 'MS',
'NC', 'MD', 'MO', 'OH', 'LA', 'HI', 'SC', 'RI', 'MI', 'NJ', 'TN',
'MA', 'IL', 'UT', 'WI', 'VA', 'NV', 'MT', 'AL', 'CT', 'AZ', 'IN',
'DC', 'DE', 'MN', 'OR', 'NM', 'AR', 'KS', 'NH', 'WV', 'AK', 'VT',
'SD', 'WY', 'ID', 'IA', 'NE'], dtype=object)
train_state_names = ['GA', 'PA', 'WI', 'CA', 'FL', 'CT', 'TX', 'MA', 'OR', 'HI', 'MO',
'MS', 'NY', 'SC', 'WV', 'IL', 'NJ', 'AZ', 'MN', 'CO', 'TN', 'AL',
'LA', 'OH', 'VA', 'NV', 'WA', 'NC', 'WY', 'OK', 'MI', 'DC', 'NM',
'AK', 'MD', 'AR', 'KS', 'IN', 'UT', 'DE', 'NH', 'KY', 'RI', 'MT',
'VT', 'SD', 'NE', 'ID', 'ME', 'IA']
X_prep['addr_state'].nunique()
49
"""
We have 50 states and one DC, so, we should have 51 values.
one state is missing. and the missing state is ND.
""";
state_names51 = ['AK','AL','AR','AZ','CA','CO','CT',
'DC',
'DE','FL','GA','HI','IA','ID','IL',
'IN','KS','KY','LA','MA','MD','ME',
'MI','MN','MO','MS','MT','NC', 'ND',
'NE','NH','NJ','NM','NV','NY','OH',
'OK','OR','PA','RI','SC','SD','TN',
'TX','UT','VA','VT','WA','WI','WV',
'WY']
len(state_names51)
51
missing_states = [i for i in state_names51 if i not in
train_state_names]
missing_states
['ND']
for col in missing_states:
name = 'addr_state:' + col
if name in X_prep.columns:
pass
else:
X_prep[name] = 0
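# sanity check of the intent above: every state in state_names51 now has
# a dummy column, so the train and test design matrices line up
assert all(('addr_state:' + s) in X_prep.columns for s in state_names51)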
# df_woe = get_woe_iv(X_prep,y_prep,'addr_state')
# plot_woe_plotly(df_woe)
"""
** do not forget the missing states.
We create the following categories:
'ND' 'NE' 'IA' NV' 'FL' 'HI' 'AL'
'NM' 'VA'
'NY'
'OK' 'TN' 'MO' 'LA' 'MD' 'NC'
'CA'
'UT' 'KY' 'AZ' 'NJ'
'AR' 'MI' 'PA' 'OH' 'MN'
'RI' 'MA' 'DE' 'SD' 'IN'
'GA' 'WA' 'OR'
'WI' 'MT'
'TX'
'IL' 'CT'
'KS' 'SC' 'CO' 'VT' 'AK' 'MS'
'WV' 'NH' 'WY' 'DC' 'ME' 'ID'
'ND_NE_IA_NV_FL_HI_AL' will be the reference category.
""";
multiple = [
'ND_NE_IA_NV_FL_HI_AL',
'NM_VA',
'OK_TN_MO_LA_MD_NC',
'UT_KY_AZ_NJ',
'AR_MI_PA_OH_MN',
'RI_MA_DE_SD_IN',
'GA_WA_OR',
'WI_MT',
'IL_CT',
'KS_SC_CO_VT_AK_MS',
'WV_NH_WY_DC_ME_ID'
]
dummies = ['addr_state:'+i for i in multiple]
for d in dummies:
print(d)
addr_state:ND_NE_IA_NV_FL_HI_AL
addr_state:NM_VA
addr_state:OK_TN_MO_LA_MD_NC
addr_state:UT_KY_AZ_NJ
addr_state:AR_MI_PA_OH_MN
addr_state:RI_MA_DE_SD_IN
addr_state:GA_WA_OR
addr_state:WI_MT
addr_state:IL_CT
addr_state:KS_SC_CO_VT_AK_MS
addr_state:WV_NH_WY_DC_ME_ID
"""
Store these dummies:
addr_state:ND_NE_IA_NV_FL_HI_AL
addr_state:NM_VA
addr_state:OK_TN_MO_LA_MD_NC
addr_state:UT_KY_AZ_NJ
addr_state:AR_MI_PA_OH_MN
addr_state:RI_MA_DE_SD_IN
addr_state:GA_WA_OR
addr_state:WI_MT
addr_state:IL_CT
addr_state:KS_SC_CO_VT_AK_MS
addr_state:WV_NH_WY_DC_ME_ID
""";
sep = '_'
one = 'ND_NE_IA_NV_FL_HI_AL'
lst_one = one.split(sep)
lst_one
['ND', 'NE', 'IA', 'NV', 'FL', 'HI', 'AL']
colname = 'addr_state'
new_cat = colname + ':' + one
new_cat
'addr_state:ND_NE_IA_NV_FL_HI_AL'
lst = [X_prep[colname + ':' + i] for i in lst_one]
sum(lst).head(2)
193817    0
174039    0
dtype: int64
colname = 'addr_state'
sep = '_'
for one in multiple:
lst_one = one.split(sep)
new_cat = colname + ':' + one
lst = [X_prep[colname + ':' + i] for i in lst_one]
X_prep[new_cat] = sum(lst)
X_prep.iloc[:2,-2:]
| addr_state:KS_SC_CO_VT_AK_MS | addr_state:WV_NH_WY_DC_ME_ID | |
|---|---|---|
| 193817 | 0 | 0 |
| 174039 | 0 | 0 |
col = 'verification_status'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique =  3
nulls =  0

193817        Verified
174039    Not Verified
Name: verification_status, dtype: object
X_prep[col].value_counts()
Verified           33473
Source Verified    29955
Not Verified       29829
Name: verification_status, dtype: int64
cols = [i for i in X_prep.columns if 'verification_status' in i]
X_prep[cols].head(2)
| verification_status | verification_status:Not Verified | verification_status:Source Verified | verification_status:Verified | |
|---|---|---|---|---|
| 193817 | Verified | 0 | 0 | 1 |
| 174039 | Not Verified | 1 | 0 | 0 |
for col in cols:
print(col)
verification_status
verification_status:Not Verified
verification_status:Source Verified
verification_status:Verified
"""
verification_status:Not Verified
verification_status:Source Verified
verification_status:Verified
""";
# they all have significant n_obs.
# no need to group them.
col = 'purpose'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique =  14
nulls =  0

193817    debt_consolidation
174039    debt_consolidation
Name: purpose, dtype: object
cols = [i for i in X_prep.columns if 'purpose' in i]
X_prep[cols].head(2)
| purpose | purpose:car | purpose:credit_card | purpose:debt_consolidation | purpose:educational | purpose:home_improvement | purpose:house | purpose:major_purchase | purpose:medical | purpose:moving | purpose:other | purpose:renewable_energy | purpose:small_business | purpose:vacation | purpose:wedding | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 193817 | debt_consolidation | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 174039 | debt_consolidation | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# df_woe = get_woe_iv(X_prep,y_prep,'purpose')
# plot_woe_plotly(df_woe)
dummies = \
"""
small_business__educational__moving__renewable_energy
other
house__medical__wedding__vacation
debt_consolidation
home_improvement__major_purchase__car
credit_card
""".strip().split('\n')
dummies = ['purpose:'+ i for i in dummies]
for d in dummies:
print(d)
purpose:small_business__educational__moving__renewable_energy
purpose:other
purpose:house__medical__wedding__vacation
purpose:debt_consolidation
purpose:home_improvement__major_purchase__car
purpose:credit_card
colname = 'purpose'
sep = '__'
multiple =[
'small_business__educational__moving__renewable_energy',
'house__medical__wedding__vacation',
'home_improvement__major_purchase__car'
]
for one in multiple:
lst_one = one.split(sep)
new_cat = colname + ':' + one
lst = [X_prep[colname + ':' + i] for i in lst_one]
X_prep[new_cat] = sum(lst)
X_prep.iloc[:2,-2:]
| purpose:house__medical__wedding__vacation | purpose:home_improvement__major_purchase__car | |
|---|---|---|
| 193817 | 0 | 0 |
| 174039 | 0 | 0 |
col = 'initial_list_status'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique =  2
nulls =  0

193817    f
174039    f
Name: initial_list_status, dtype: object
cols = [i for i in X_prep.columns if 'initial_list_status' in i]
X_prep[cols].head(2)
| initial_list_status | initial_list_status:f | initial_list_status:w | |
|---|---|---|---|
| 193817 | f | 1 | 0 |
| 174039 | f | 1 | 0 |
# we already have binary encoding.
dummies = """
initial_list_status:f
initial_list_status:w
""";
col = 'term_int'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique =  2
nulls =  0

193817    36
174039    36
Name: term_int, dtype: int64
X_prep['term:36'] = np.where((X_prep['term_int'] == 36), 1, 0)
X_prep['term:60'] = np.where((X_prep['term_int'] == 60), 1, 0)
dummies = \
"""
term:36
term:60
""";
col = 'emp_length_int'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique =  11
nulls =  0

193817    5.0
174039    1.0
Name: emp_length_int, dtype: float32
X_prep[col].value_counts()
10.0    30273
1.0     13123
2.0      8072
3.0      7245
5.0      6195
4.0      5620
6.0      5339
7.0      5118
8.0      4529
0.0      4131
9.0      3612
Name: emp_length_int, dtype: int64
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
| emp_length_int | |
|---|---|
| 193817 | 5.0 |
| 174039 | 1.0 |
# for continuous variables use sort=colname
df_woe = get_woe_iv(X_prep,y_prep,col,sort=col)
plot_woe_plotly(df_woe)
| emp_length_int | n_obs | woe | diff_n_obs | text_plotly | |
|---|---|---|---|---|---|
| 0 | 0.000000 | 4,131 | -0.224541 | 8,992 | n_obs: 4,131 WoE: -0.2245 |
| 1 | 1.000000 | 13,123 | -0.058005 | 5,051 | n_obs: 13,123 WoE: -0.058 |
| 2 | 2.000000 | 8,072 | 0.027248 | 827 | n_obs: 8,072 WoE: 0.0272 |
| 3 | 3.000000 | 7,245 | -0.012977 | 1,625 | n_obs: 7,245 WoE: -0.013 |
| 4 | 4.000000 | 5,620 | 0.046928 | 575 | n_obs: 5,620 WoE: 0.0469 |
| 5 | 5.000000 | 6,195 | -0.014760 | 856 | n_obs: 6,195 WoE: -0.0148 |
| 6 | 6.000000 | 5,339 | -0.084708 | 221 | n_obs: 5,339 WoE: -0.0847 |
| 7 | 7.000000 | 5,118 | -0.065842 | 589 | n_obs: 5,118 WoE: -0.0658 |
| 8 | 8.000000 | 4,529 | -0.020316 | 917 | n_obs: 4,529 WoE: -0.0203 |
| 9 | 9.000000 | 3,612 | -0.042647 | 26,661 | n_obs: 3,612 WoE: -0.0426 |
| 10 | 10.000000 | 30,273 | 0.087623 | nan | n_obs: 30,273 WoE: 0.0876 |
"""
emp_length:0
emp_length:1
emp_length:2_3
emp_length:4_5
emp_length:6_7
emp_length:8_9
emp_length:10
""";
X_prep['emp_length:0'] = np.where(X_prep['emp_length_int'].isin([0]), 1, 0)
X_prep['emp_length:1'] = np.where(X_prep['emp_length_int'].isin([1]), 1, 0)
X_prep['emp_length:2_3'] = np.where(X_prep['emp_length_int'].isin([2,3]), 1, 0)
X_prep['emp_length:4_5'] = np.where(X_prep['emp_length_int'].isin([4,5]), 1, 0)
X_prep['emp_length:6_7'] = np.where(X_prep['emp_length_int'].isin([6,7]), 1, 0)
X_prep['emp_length:8_9'] = np.where(X_prep['emp_length_int'].isin([8,9]), 1, 0)
X_prep['emp_length:10'] = np.where(X_prep['emp_length_int'].isin([10]), 1, 0)
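# these seven dummies should partition emp_length_int (0-10 after the
# NaN fill earlier) exactly once per row; quick check:
cols_emp = ['emp_length:0','emp_length:1','emp_length:2_3','emp_length:4_5',
            'emp_length:6_7','emp_length:8_9','emp_length:10']
assert (X_prep[cols_emp].sum(axis=1) == 1).all()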
col = 'mths_since_issue_d'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique =  91
nulls =  0

193817    90.0
174039    87.0
Name: mths_since_issue_d, dtype: float64
X_prep[colf] = pd.cut(X_prep[col], 50)
X_prep[colf].head(2)
193817    (89.2, 91.0]
174039    (85.6, 87.4]
Name: mths_since_issue_d_factor, dtype: category
Categories (50, interval[float64]): [(63.91, 65.8] < (65.8, 67.6] < (67.6, 69.4] < (69.4, 71.2] ... (146.8, 148.6] < (148.6, 150.4] < (150.4, 152.2] < (152.2, 154.0]]
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
# plot_woe_plotly(df_woe)
bins = [0,66,70,75,82,93,128]
col = 'mths_since_issue_d'
create_bins_dummies_int(X_prep,col,bins)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
mths_since_issue_d:0_66
mths_since_issue_d:67_70
mths_since_issue_d:71_75
mths_since_issue_d:76_82
mths_since_issue_d:83_93
mths_since_issue_d:94_128
mths_since_issue_d:>=129
| mths_since_issue_d | mths_since_issue_d_factor | mths_since_issue_d:0_66 | mths_since_issue_d:67_70 | mths_since_issue_d:71_75 | mths_since_issue_d:76_82 | mths_since_issue_d:83_93 | mths_since_issue_d:94_128 | mths_since_issue_d:>=129 | |
|---|---|---|---|---|---|---|---|---|---|
| 193817 | 90.0 | (89.2, 91.0] | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 174039 | 87.0 | (85.6, 87.4] | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
# after 93, there are very few observations;
# we could group them all into one bucket, or into two.
"""
mths_since_issue_d:0_66
mths_since_issue_d:67_70
mths_since_issue_d:71_75
mths_since_issue_d:76_82
mths_since_issue_d:83_93
mths_since_issue_d:94_128
mths_since_issue_d:>=129
""";
col = 'int_rate'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique =  477
nulls =  0

193817    10.16
174039    15.80
Name: int_rate, dtype: float32
X_prep[colf] = pd.cut(X_prep[col], 50)
X_prep[colf].head(2)
193817     (9.961, 10.374]
174039    (15.74, 16.153]
Name: int_rate_factor, dtype: category
Categories (50, interval[float64]): [(5.399, 5.833] < (5.833, 6.246] < (6.246, 6.658] < (6.658, 7.071] ... (24.409, 24.822] < (24.822, 25.234] < (25.234, 25.647] < (25.647, 26.06]]
# for continuous variables use sort=colname
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
plot_woe(df_woe,xrot=90,color='k')
bins = [0,9.961,12.025, 15.74, 20.281,27]
# we have a monotonically decreasing curve, so it is easy to partition.
# the first few points zigzag; we suspect low n_obs there,
# and n_obs is indeed low, so bundle those bins together.
#
# look at the graph for where the woe drops sharply
"""
int_rate:0_9.961
int_rate:9.961_12.025
int_rate:12.025_15.74
int_rate:15.74_20.281
int_rate:>=20.281
""";
bins = [0,9.961,12.025, 15.74, 20.281]
col = 'int_rate'
create_bins_dummies_cont(X_prep,col,bins)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
int_rate:0_9.961
int_rate:9.961_12.025
int_rate:12.025_15.74
int_rate:15.74_20.281
int_rate:>=20.281
| int_rate | int_rate_factor | int_rate:0_9.961 | int_rate:9.961_12.025 | int_rate:12.025_15.74 | int_rate:15.74_20.281 | int_rate:>=20.281 | |
|---|---|---|---|---|---|---|---|
| 193817 | 10.16 | (9.961, 10.374] | 0 | 1 | 0 | 0 | 0 |
| 174039 | 15.80 | (15.74, 16.153] | 0 | 0 | 0 | 1 | 0 |
col = 'funded_amnt'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique =  1245
nulls =  0

193817    5500
174039    6000
Name: funded_amnt, dtype: int32
X_prep[colf] = pd.cut(X_prep[col], 50)
# for continuous variables use sort=colname
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
plot_woe(df_woe,xrot=90,color='k')
# woe is almost flat across the bins,
# i.e. funded_amnt tells us almost nothing about the target,
# so we can remove the variable from the analysis.
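# the information value should back this up; with return_all=True the 'iv'
# column holds the total IV (rule of thumb: IV < 0.02 ~ not predictive).
# assuming every bin has both goods and bads:
df_woe_all = get_woe_iv(X_prep,y_prep,colf,sort=colf,return_all=True)
df_woe_all['iv'].iloc[0]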
col = 'mths_since_earliest_cr_line'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique =  613
nulls =  0

193817    156.0
174039    293.0
Name: mths_since_earliest_cr_line, dtype: float64
X_prep[colf] = pd.cut(X_prep[col], 50)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
plot_woe(df_woe,xrot=90,color='k')
bins = [0,207, 350]
col = 'mths_since_earliest_cr_line'
create_bins_dummies_int(X_prep,col,bins)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
mths_since_earliest_cr_line:0_207
mths_since_earliest_cr_line:208_350
mths_since_earliest_cr_line:>=351
| mths_since_earliest_cr_line | mths_since_earliest_cr_line_factor | mths_since_earliest_cr_line:0_207 | mths_since_earliest_cr_line:208_350 | mths_since_earliest_cr_line:>=351 | |
|---|---|---|---|---|---|
| 193817 | 156.0 | (135.4, 159.38] | 1 | 0 | 0 |
| 174039 | 293.0 | (279.28, 303.26] | 0 | 1 | 0 |
"""
Ignore first few low numbers,
we see upward trend
mths_since_earliest_cr_line:0_207
mths_since_earliest_cr_line:208_350
mths_since_earliest_cr_line:>=351
""";
col = 'delinq_2yrs'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique = 19 nulls = 0
193817 0.0 174039 1.0 Name: delinq_2yrs, dtype: float32
X_prep[col].value_counts()
0.0 76625 1.0 11186 2.0 3286 3.0 1150 4.0 457 5.0 249 6.0 137 7.0 70 8.0 33 9.0 21 10.0 14 12.0 8 11.0 7 13.0 4 15.0 3 22.0 2 17.0 2 14.0 2 18.0 1 Name: delinq_2yrs, dtype: int64
# with only 19 discrete values, we do not need a factor (binning) here.
df_woe = get_woe_iv(X_prep,y_prep,col,sort=col)
plot_woe(df_woe,xrot=90,color='k')
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/algorithms.py:1926: RuntimeWarning: invalid value encountered in subtract
"""
we might be attemped to ignore the feature
but this might be useful feature
we see lots of zeros, make it one feature.
then make 1-3 and greater than 3.
delinq_2yrs:0
delinq_2yrs:1_3
delinq_2yrs:>=3
""";
X_prep['delinq_2yrs:0'] = \
np.where((X_prep['delinq_2yrs'] ==0), 1, 0)
X_prep['delinq_2yrs:1-3'] = \
np.where((X_prep['delinq_2yrs'] >= 1) &
(X_prep['delinq_2yrs'] <= 3), 1, 0)
X_prep['delinq_2yrs:>3'] = \
np.where((X_prep['delinq_2yrs'] > 3), 1, 0)
col = 'inq_last_6mths'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique = 16 nulls = 0
193817 1.0 174039 0.0 Name: inq_last_6mths, dtype: float32
X_prep[col].value_counts()
0.0 48486 1.0 25848 2.0 11517 3.0 5028 4.0 1489 5.0 551 6.0 247 7.0 42 8.0 23 9.0 12 10.0 5 11.0 3 12.0 2 15.0 2 14.0 1 24.0 1 Name: inq_last_6mths, dtype: int64
df_woe = get_woe_iv(X_prep,y_prep,col,sort=col)
plot_woe(df_woe,xrot=90,color='k')
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/series.py:679: RuntimeWarning: divide by zero encountered in log /Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/algorithms.py:1926: RuntimeWarning: invalid value encountered in subtract
"""
we see lots of zeros, make it one feature.
then make 1-3 and greater than 3.
inq_last_6mths:0
inq_last_6mths:1
inq_last_6mths:2_3
inq_last_6mths:>=3
""";
X_prep['inq_last_6mths:0'] = \
np.where((X_prep['inq_last_6mths'] ==0), 1, 0)
X_prep['inq_last_6mths:1'] = \
np.where((X_prep['inq_last_6mths'] ==1), 1, 0)
X_prep['inq_last_6mths:2_3'] = \
np.where((X_prep['inq_last_6mths'] > 1) &
(X_prep['inq_last_6mths'] <= 3), 1, 0)
X_prep['inq_last_6mths:>3'] = \
np.where((X_prep['inq_last_6mths'] > 3), 1, 0)
col = 'open_acc'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique = 55 nulls = 0
193817 5.0 174039 7.0 Name: open_acc, dtype: float32
df_woe = get_woe_iv(X_prep,y_prep,col,sort=col)
plot_woe(df_woe,xrot=90,color='k')
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/series.py:679: RuntimeWarning: divide by zero encountered in log /Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/algorithms.py:1926: RuntimeWarning: invalid value encountered in subtract
X_prep[col].value_counts().sort_index()[lambda x: x>600]
3.0 1125 4.0 2562 5.0 4199 6.0 6183 7.0 7520 8.0 8415 9.0 8766 10.0 8451 11.0 7867 12.0 6854 13.0 5983 14.0 5031 15.0 3997 16.0 3334 17.0 2656 18.0 2206 19.0 1754 20.0 1349 21.0 1048 22.0 790 23.0 658 Name: open_acc, dtype: int64
X_prep[col+':<=3'] = \
np.where((X_prep[col] <=3), 1, 0)
X_prep[col+':4_5'] = \
np.where((X_prep[col] > 3) &
(X_prep[col] <= 5), 1, 0)
X_prep[col+':6_13'] = \
np.where((X_prep[col] > 5) &
(X_prep[col] <= 13), 1, 0)
X_prep[col+':13_17'] = \
np.where((X_prep[col] > 13) &
(X_prep[col] <= 17), 1, 0)
X_prep[col+':>17'] = \
np.where((X_prep[col] > 17), 1, 0)
"""
There are many small number and woe curve is flat.
look at n_obs and group them.
open_acc:<=3
open_acc:4_5
open_acc:6_13
open_acc:13_17
open_acc:>17
""";
# df_woe
col = 'pub_rec'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique = 17 nulls = 0
193817 0.0 174039 0.0 Name: pub_rec, dtype: float32
df_woe = get_woe_iv(X_prep,y_prep,col,sort=col)
plot_woe(df_woe,xrot=90,color='k')
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/algorithms.py:1926: RuntimeWarning: invalid value encountered in subtract
X_prep[col].value_counts().sort_index()
0.0 81014 1.0 10635 2.0 1098 3.0 322 4.0 89 5.0 45 6.0 26 7.0 10 8.0 7 9.0 2 10.0 2 11.0 2 13.0 1 16.0 1 17.0 1 18.0 1 19.0 1 Name: pub_rec, dtype: int64
X_prep[col+':0'] = \
np.where((X_prep[col] ==0), 1, 0)
X_prep[col+':1'] = \
np.where((X_prep[col] ==1), 1, 0)
X_prep[col+':2'] = \
np.where((X_prep[col] ==2), 1, 0)
X_prep[col+':3'] = \
np.where((X_prep[col] ==3), 1, 0)
X_prep[col+':>3'] = \
np.where((X_prep[col] > 3), 1, 0)
"""
pub_rec:0
pub_rec:1
pub_rec:2
pub_rec:3
pub_rec:>3
""";
col = 'total_acc'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique = 99 nulls = 0
193817 6.0 174039 16.0 Name: total_acc, dtype: float32
X_prep[col].value_counts()[lambda x: x>9000]
Series([], Name: total_acc, dtype: int64)
X_prep[colf] = pd.cut(X_prep[col], 50)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
plot_woe(df_woe,xrot=90,color='k')
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/algorithms.py:1926: RuntimeWarning: invalid value encountered in subtract
bins = [0,6.24,21.84,37.44,40.56]
col = 'total_acc'
create_bins_dummies_cont(X_prep,col,bins)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
total_acc:0_6.24 total_acc:6.24_21.84 total_acc:21.84_37.44 total_acc:37.44_40.56 total_acc:>=40.56
| total_acc | total_acc_factor | total_acc:0_6.24 | total_acc:6.24_21.84 | total_acc:21.84_37.44 | total_acc:37.44_40.56 | total_acc:>=40.56 | |
|---|---|---|---|---|---|---|---|
| 193817 | 6.0 | (4.68, 7.02] | 1 | 0 | 0 | 0 | 0 |
| 174039 | 16.0 | (14.04, 16.38] | 0 | 1 | 0 | 0 | 0 |
"""
total_acc:0_6.24
total_acc:6.24_21.84
total_acc:21.84_37.44
total_acc:37.44_40.56
total_acc:>=40.56
""";
col = 'acc_now_delinq'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique = 5 nulls = 0
193817 0.0 174039 0.0 Name: acc_now_delinq, dtype: float32
X_prep[col].value_counts()
0.0 92922 1.0 323 2.0 9 3.0 2 4.0 1 Name: acc_now_delinq, dtype: int64
X_prep[col+':0'] = \
np.where((X_prep[col] ==0), 1, 0)
X_prep[col+':>=1'] = \
np.where((X_prep[col] >= 1), 1, 0)
"""
acc_now_delinq:0
acc_now_delinq:>=1
""";
col = 'total_rev_hi_lim'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique = 4925 nulls = 0
193817 9600.0 174039 7300.0 Name: total_rev_hi_lim, dtype: float32
X_prep[colf] = pd.cut(X_prep[col], 100)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
df_woe.head()
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/algorithms.py:1926: RuntimeWarning: invalid value encountered in subtract
| total_rev_hi_lim_factor | n_obs | woe | diff_n_obs | text_plotly | |
|---|---|---|---|---|---|
| 0 | (-1090.7, 10907.0] | 21484 | -0.173011 | 7121.0 | n_obs: 21,484<br>WoE: -0.173<br> |
| 1 | (10907.0, 21814.0] | 28605 | -0.101744 | 10872.0 | n_obs: 28,605<br>WoE: -0.1017<br> |
| 2 | (21814.0, 32721.0] | 17733 | 0.011071 | 7346.0 | n_obs: 17,733<br>WoE: 0.0111<br> |
| 3 | (32721.0, 43628.0] | 10387 | 0.137990 | 4584.0 | n_obs: 10,387<br>WoE: 0.138<br> |
| 4 | (43628.0, 54535.0] | 5803 | 0.256852 | 2541.0 | n_obs: 5,803<br>WoE: 0.2569<br> |
df_woe = df_woe[df_woe.n_obs > 1000]
df_woe
| total_rev_hi_lim_factor | n_obs | woe | diff_n_obs | text_plotly | |
|---|---|---|---|---|---|
| 0 | (-1090.7, 10907.0] | 21484 | -0.173011 | 7121.0 | n_obs: 21,484<br>WoE: -0.173<br> |
| 1 | (10907.0, 21814.0] | 28605 | -0.101744 | 10872.0 | n_obs: 28,605<br>WoE: -0.1017<br> |
| 2 | (21814.0, 32721.0] | 17733 | 0.011071 | 7346.0 | n_obs: 17,733<br>WoE: 0.0111<br> |
| 3 | (32721.0, 43628.0] | 10387 | 0.137990 | 4584.0 | n_obs: 10,387<br>WoE: 0.138<br> |
| 4 | (43628.0, 54535.0] | 5803 | 0.256852 | 2541.0 | n_obs: 5,803<br>WoE: 0.2569<br> |
| 5 | (54535.0, 65442.0] | 3262 | 0.444045 | 1303.0 | n_obs: 3,262<br>WoE: 0.444<br> |
| 6 | (65442.0, 76349.0] | 1959 | 0.521514 | 761.0 | n_obs: 1,959<br>WoE: 0.5215<br> |
| 7 | (76349.0, 87256.0] | 1198 | 0.580151 | 398.0 | n_obs: 1,198<br>WoE: 0.5802<br> |
plot_woe(df_woe,xrot=90,color='k')
"""
total_rev_hi_lim:<=5k
total_rev_hi_lim:5k_10k
total_rev_hi_lim:10k_20k
total_rev_hi_lim:20k_30k
total_rev_hi_lim:30k_40k
total_rev_hi_lim:40k_50k
total_rev_hi_lim:50k_65k
total_rev_hi_lim:65k_80k
total_rev_hi_lim:>80k
""";
X_prep['total_rev_hi_lim:<=5k'] = \
np.where((X_prep['total_rev_hi_lim'] <= 5000), 1, 0)
X_prep['total_rev_hi_lim:5k_10k'] = \
np.where((X_prep['total_rev_hi_lim'] > 5000) &
(X_prep['total_rev_hi_lim'] <= 10000), 1, 0)
X_prep['total_rev_hi_lim:10k_20k'] = \
np.where((X_prep['total_rev_hi_lim'] > 10000) &
(X_prep['total_rev_hi_lim'] <= 20000), 1, 0)
X_prep['total_rev_hi_lim:20k_30k'] = \
np.where((X_prep['total_rev_hi_lim'] > 20000) &
(X_prep['total_rev_hi_lim'] <= 30000), 1, 0)
X_prep['total_rev_hi_lim:30k_40k'] = \
np.where((X_prep['total_rev_hi_lim'] > 30000) &
(X_prep['total_rev_hi_lim'] <= 40000), 1, 0)
X_prep['total_rev_hi_lim:40k_50k'] = \
np.where((X_prep['total_rev_hi_lim'] > 40000) &
(X_prep['total_rev_hi_lim'] <= 50000), 1, 0)
X_prep['total_rev_hi_lim:50k_65k'] = \
np.where((X_prep['total_rev_hi_lim'] > 50000) &
(X_prep['total_rev_hi_lim'] <= 65000), 1, 0)
X_prep['total_rev_hi_lim:65k_80k'] = \
np.where((X_prep['total_rev_hi_lim'] > 65000) &
(X_prep['total_rev_hi_lim'] <= 80000), 1, 0)
X_prep['total_rev_hi_lim:>80k'] = np.where((X_prep['total_rev_hi_lim'] > 80000), 1, 0)
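# As an aside, the chain of np.where calls above could be generated more
# compactly with pd.cut + pd.get_dummies. A sketch that produces the same
# nine indicators (shown for comparison only, since the columns already
# exist at this point):
import numpy as np
import pandas as pd

# pd.cut uses (low, high] intervals by default, matching the
# "> lower, <= upper" conditions of the np.where version above
edges = [-np.inf, 5_000, 10_000, 20_000, 30_000,
         40_000, 50_000, 65_000, 80_000, np.inf]
names = ['<=5k', '5k_10k', '10k_20k', '20k_30k', '30k_40k',
         '40k_50k', '50k_65k', '65k_80k', '>80k']
cats = pd.cut(X_prep['total_rev_hi_lim'], bins=edges, labels=names)
rev_dummies = pd.get_dummies(cats, prefix='total_rev_hi_lim', prefix_sep=':')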
col = 'installment'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique = 26843 nulls = 0
193817 177.889999 174039 210.360001 Name: installment, dtype: float32
X_prep[colf] = pd.cut(X_prep[col], 50)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
df_woe.head()
| installment_factor | n_obs | woe | diff_n_obs | text_plotly | |
|---|---|---|---|---|---|
| 0 | (14.518, 43.754] | 454 | 0.159373 | 780.0 | n_obs: 454<br>WoE: 0.1594<br> |
| 1 | (43.754, 71.599] | 1234 | 0.130200 | 476.0 | n_obs: 1,234<br>WoE: 0.1302<br> |
| 2 | (71.599, 99.443] | 1710 | 0.067130 | 572.0 | n_obs: 1,710<br>WoE: 0.0671<br> |
| 3 | (99.443, 127.288] | 2282 | -0.038149 | 144.0 | n_obs: 2,282<br>WoE: -0.0381<br> |
| 4 | (127.288, 155.132] | 2426 | 0.026251 | 1816.0 | n_obs: 2,426<br>WoE: 0.0263<br> |
df_woe = df_woe[df_woe.n_obs > 500]
df_woe.style\
.background_gradient(subset=['n_obs'],cmap='Blues')\
.background_gradient(subset=['woe'],cmap='Reds')\
.format({'n_obs':"{:,d}", 'diff_n_obs':"{:,.0f}"})
| installment_factor | n_obs | woe | diff_n_obs | text_plotly | |
|---|---|---|---|---|---|
| 1 | (43.754, 71.599] | 1,234 | 0.130200 | 476 | n_obs: 1,234 WoE: 0.1302 |
| 2 | (71.599, 99.443] | 1,710 | 0.067130 | 572 | n_obs: 1,710 WoE: 0.0671 |
| 3 | (99.443, 127.288] | 2,282 | -0.038149 | 144 | n_obs: 2,282 WoE: -0.0381 |
| 4 | (127.288, 155.132] | 2,426 | 0.026251 | 1,816 | n_obs: 2,426 WoE: 0.0263 |
| 5 | (155.132, 182.976] | 4,242 | 0.033381 | 41 | n_obs: 4,242 WoE: 0.0334 |
| 6 | (182.976, 210.821] | 4,283 | 0.126215 | 541 | n_obs: 4,283 WoE: 0.1262 |
| 7 | (210.821, 238.665] | 3,742 | 0.096831 | 821 | n_obs: 3,742 WoE: 0.0968 |
| 8 | (238.665, 266.51] | 4,563 | 0.129521 | 422 | n_obs: 4,563 WoE: 0.1295 |
| 9 | (266.51, 294.354] | 4,985 | -0.028918 | 187 | n_obs: 4,985 WoE: -0.0289 |
| 10 | (294.354, 322.198] | 5,172 | 0.067806 | 1,278 | n_obs: 5,172 WoE: 0.0678 |
| 11 | (322.198, 350.043] | 6,450 | -0.025363 | 1,643 | n_obs: 6,450 WoE: -0.0254 |
| 12 | (350.043, 377.887] | 4,807 | -0.101960 | 340 | n_obs: 4,807 WoE: -0.102 |
| 13 | (377.887, 405.732] | 4,467 | 0.126107 | 1,182 | n_obs: 4,467 WoE: 0.1261 |
| 14 | (405.732, 433.576] | 3,285 | -0.125048 | 120 | n_obs: 3,285 WoE: -0.125 |
| 15 | (433.576, 461.42] | 3,165 | -0.135761 | 706 | n_obs: 3,165 WoE: -0.1358 |
| 16 | (461.42, 489.265] | 3,871 | 0.099491 | 81 | n_obs: 3,871 WoE: 0.0995 |
| 17 | (489.265, 517.109] | 3,952 | -0.078377 | 1,163 | n_obs: 3,952 WoE: -0.0784 |
| 18 | (517.109, 544.954] | 2,789 | -0.201587 | 101 | n_obs: 2,789 WoE: -0.2016 |
| 19 | (544.954, 572.798] | 2,688 | -0.016065 | 357 | n_obs: 2,688 WoE: -0.0161 |
| 20 | (572.798, 600.642] | 2,331 | -0.177846 | 33 | n_obs: 2,331 WoE: -0.1778 |
| 21 | (600.642, 628.487] | 2,298 | 0.183212 | 142 | n_obs: 2,298 WoE: 0.1832 |
| 22 | (628.487, 656.331] | 2,156 | 0.076731 | 123 | n_obs: 2,156 WoE: 0.0767 |
| 23 | (656.331, 684.176] | 2,279 | 0.046221 | 687 | n_obs: 2,279 WoE: 0.0462 |
| 24 | (684.176, 712.02] | 1,592 | -0.208344 | 376 | n_obs: 1,592 WoE: -0.2083 |
| 25 | (712.02, 739.864] | 1,216 | 0.068516 | 113 | n_obs: 1,216 WoE: 0.0685 |
| 26 | (739.864, 767.709] | 1,329 | 0.167071 | 145 | n_obs: 1,329 WoE: 0.1671 |
| 27 | (767.709, 795.553] | 1,184 | 0.075039 | 138 | n_obs: 1,184 WoE: 0.075 |
| 28 | (795.553, 823.398] | 1,046 | 0.138044 | 90 | n_obs: 1,046 WoE: 0.138 |
| 29 | (823.398, 851.242] | 956 | 0.026938 | 88 | n_obs: 956 WoE: 0.0269 |
| 30 | (851.242, 879.086] | 1,044 | 0.051178 | 272 | n_obs: 1,044 WoE: 0.0512 |
| 31 | (879.086, 906.931] | 772 | -0.060162 | 204 | n_obs: 772 WoE: -0.0602 |
| 32 | (906.931, 934.775] | 568 | -0.360655 | 35 | n_obs: 568 WoE: -0.3607 |
| 33 | (934.775, 962.62] | 603 | -0.234802 | 126 | n_obs: 603 WoE: -0.2348 |
plot_woe(df_woe,xrot=90,color='k')
bins = [0, 127,210,266,294,322,378,406,433,517,545,
629,712,796,852,880,935,963]
col = 'installment'
create_bins_dummies_cont(X_prep,col,bins)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
installment:0_127 installment:127_210 installment:210_266 installment:266_294 installment:294_322 installment:322_378 installment:378_406 installment:406_433 installment:433_517 installment:517_545 installment:545_629 installment:629_712 installment:712_796 installment:796_852 installment:852_880 installment:880_935 installment:935_963 installment:>=963
| installment | installment_factor | installment:0_127 | installment:127_210 | installment:210_266 | installment:266_294 | installment:294_322 | installment:322_378 | installment:378_406 | installment:406_433 | installment:433_517 | installment:517_545 | installment:545_629 | installment:629_712 | installment:712_796 | installment:796_852 | installment:852_880 | installment:880_935 | installment:935_963 | installment:>=963 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 193817 | 177.889999 | (155.132, 182.976] | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 174039 | 210.360001 | (182.976, 210.821] | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
"""
installment:0_127
installment:127_210
installment:210_266
installment:266_294
installment:294_322
installment:322_378
installment:378_406
installment:406_433
installment:433_517
installment:517_545
installment:545_629
installment:629_712
installment:712_796
installment:796_852
installment:852_880
installment:880_935
installment:935_963
installment:>=963
""";
col = 'annual_inc'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique = 8952 nulls = 0
193817 18572.849609 174039 20000.000000 Name: annual_inc, dtype: float32
X_prep[col].describe()
count 9.325700e+04 mean 7.338044e+04 std 5.973804e+04 min 2.000000e+03 25% 4.500000e+04 50% 6.300000e+04 75% 8.826200e+04 max 7.446395e+06 Name: annual_inc, dtype: float64
X_prep[colf] = pd.cut(X_prep[col], 50)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
df_woe.head()
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/algorithms.py:1926: RuntimeWarning: invalid value encountered in subtract
| annual_inc_factor | n_obs | woe | diff_n_obs | text_plotly | |
|---|---|---|---|---|---|
| 0 | (-5444.395, 150887.9] | 89166 | -0.018654 | 85606.0 | n_obs: 89,166<br>WoE: -0.0187<br> |
| 1 | (150887.9, 299775.8] | 3560 | 0.498201 | 3169.0 | n_obs: 3,560<br>WoE: 0.4982<br> |
| 2 | (299775.8, 448663.7] | 391 | 0.503297 | 317.0 | n_obs: 391<br>WoE: 0.5033<br> |
| 3 | (448663.7, 597551.6] | 74 | 0.526648 | 47.0 | n_obs: 74<br>WoE: 0.5266<br> |
| 4 | (597551.6, 746439.5] | 27 | inf | 12.0 | n_obs: 27<br>WoE: inf<br> |
# we see too many obs in the first category; increase the bins to 100
X_prep[colf] = pd.cut(X_prep[col], 100)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
df_woe.head()
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/algorithms.py:1926: RuntimeWarning: invalid value encountered in subtract
| annual_inc_factor | n_obs | woe | diff_n_obs | text_plotly | |
|---|---|---|---|---|---|
| 0 | (-5444.395, 76443.95] | 61174 | -0.133182 | 33182.0 | n_obs: 61,174<br>WoE: -0.1332<br> |
| 1 | (76443.95, 150887.9] | 27992 | 0.274730 | 25035.0 | n_obs: 27,992<br>WoE: 0.2747<br> |
| 2 | (150887.9, 225331.85] | 2957 | 0.468022 | 2354.0 | n_obs: 2,957<br>WoE: 0.468<br> |
| 3 | (225331.85, 299775.8] | 603 | 0.658820 | 299.0 | n_obs: 603<br>WoE: 0.6588<br> |
| 4 | (299775.8, 374219.75] | 304 | 0.555222 | 217.0 | n_obs: 304<br>WoE: 0.5552<br> |
"""
Looking at factors of annual income, we can make two
groups with 150k (or 140k) and analyze the group
earning more than 140k separately.
""";
X_prep_tmp = X_prep[X_prep[col] <= 140000].copy()  # .copy() avoids SettingWithCopyWarning
X_prep_tmp[colf] = pd.cut(X_prep_tmp[col], 50)
y_prep_tmp = y_prep[X_prep_tmp.index]
df_woe = get_woe_iv(X_prep_tmp,y_prep_tmp,colf,sort=colf)
df_woe.head()
| annual_inc_factor | n_obs | woe | diff_n_obs | text_plotly | |
|---|---|---|---|---|---|
| 0 | (1862.0, 4760.0] | 4 | -0.976810 | 9.0 | n_obs: 4<br>WoE: -0.9768<br> |
| 1 | (4760.0, 7520.0] | 13 | -0.871449 | 58.0 | n_obs: 13<br>WoE: -0.8714<br> |
| 2 | (7520.0, 10280.0] | 71 | -0.482791 | 84.0 | n_obs: 71<br>WoE: -0.4828<br> |
| 3 | (10280.0, 13040.0] | 155 | -0.473706 | 126.0 | n_obs: 155<br>WoE: -0.4737<br> |
| 4 | (13040.0, 15800.0] | 281 | -0.662220 | 129.0 | n_obs: 281<br>WoE: -0.6622<br> |
X_prep_tmp[col].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x1aa38c128>
plot_woe(df_woe,xrot=90,color='k')
"""
WoE is monotonically increasing.
We can break into 10 equal parts.
""";
col = 'annual_inc'
bins = [20_000,30_000,40_000,50_000,
60_000,70_000,80_000,90_000,
100_000,120_000,140_000]
# we need one more label than bins
labels = ['<20k',
'20k_30k','30k_40k','40k_50k',
'50k_60k','60k_70k','70k_80k',
'80k_90k','90k_100k','100k_120k','120k_140k',
'>140k']
create_dummies_left_inclusive(X_prep,col,bins,labels)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
annual_inc:<20k annual_inc:20k_30k annual_inc:30k_40k annual_inc:40k_50k annual_inc:50k_60k annual_inc:60k_70k annual_inc:70k_80k annual_inc:80k_90k annual_inc:90k_100k annual_inc:100k_120k annual_inc:120k_140k annual_inc:>140k
| annual_inc | annual_inc_factor | annual_inc:<20k | annual_inc:20k_30k | annual_inc:30k_40k | annual_inc:40k_50k | annual_inc:50k_60k | annual_inc:60k_70k | annual_inc:70k_80k | annual_inc:80k_90k | annual_inc:90k_100k | annual_inc:100k_120k | annual_inc:120k_140k | annual_inc:>140k | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 193817 | 18572.849609 | (-5444.395, 76443.95] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 174039 | 20000.000000 | (-5444.395, 76443.95] | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
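# create_dummies_left_inclusive is another helper defined earlier. Note
# that in the output above annual_inc = 20000.0 is flagged in both
# annual_inc:<20k and annual_inc:20k_30k, so the real helper's treatment
# of exact edge values is ambiguous; the sketch below keeps the bins
# left-inclusive and non-overlapping (an assumption).
import numpy as np

def create_dummies_left_inclusive_sketch(df, col, bins, labels):
    """Hypothetical version of create_dummies_left_inclusive.

    Needs len(labels) == len(bins) + 1: one label below the first
    edge, one per [low, high) interval, one at/above the last edge.
    """
    df[f'{col}:{labels[0]}'] = np.where(df[col] < bins[0], 1, 0)
    for (lo, hi), lab in zip(zip(bins[:-1], bins[1:]), labels[1:-1]):
        df[f'{col}:{lab}'] = np.where((df[col] >= lo) & (df[col] < hi), 1, 0)
    df[f'{col}:{labels[-1]}'] = np.where(df[col] >= bins[-1], 1, 0)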
col = 'mths_since_last_delinq'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique = 113 nulls = 49936
193817 NaN 174039 22.0 Name: mths_since_last_delinq, dtype: float32
X_prep[col].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x1a8c06b70>
# create a missing-value indicator column
vals = np.where((X_prep[col].isnull()), 1, 0)
X_prep[col+':missing'] = vals
print(col+':missing')
mths_since_last_delinq:missing
"""
Do not forget missing:
mths_since_last_delinq:missing
mths_since_last_delinq:0_3
mths_since_last_delinq:4_30
mths_since_last_delinq:31_56
mths_since_last_delinq:>=57
""";
X_prep_tmp = X_prep[pd.notnull(X_prep[col])].copy()  # .copy() avoids SettingWithCopyWarning
y_prep_tmp = y_prep[X_prep_tmp.index]
X_prep_tmp[colf] = pd.cut(X_prep_tmp[col], 50)
df_woe = get_woe_iv(X_prep_tmp,y_prep_tmp,colf,sort=colf)
df_woe.head()
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/series.py:679: RuntimeWarning: divide by zero encountered in log /Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/algorithms.py:1926: RuntimeWarning: invalid value encountered in subtract
| mths_since_last_delinq_factor | n_obs | woe | diff_n_obs | text_plotly | |
|---|---|---|---|---|---|
| 0 | (-0.152, 3.04] | 1369 | -0.268930 | 608.0 | n_obs: 1,369<br>WoE: -0.2689<br> |
| 1 | (3.04, 6.08] | 1977 | -0.047493 | 515.0 | n_obs: 1,977<br>WoE: -0.0475<br> |
| 2 | (6.08, 9.12] | 2492 | -0.105770 | 34.0 | n_obs: 2,492<br>WoE: -0.1058<br> |
| 3 | (9.12, 12.16] | 2458 | -0.045329 | 19.0 | n_obs: 2,458<br>WoE: -0.0453<br> |
| 4 | (12.16, 15.2] | 2439 | 0.013693 | 108.0 | n_obs: 2,439<br>WoE: 0.0137<br> |
plot_woe(df_woe,xrot=90,color='k')
# months are integers, so make the intervals integer-valued.
# NOTE: the :missing category was already created above.
col = 'mths_since_last_delinq'
bins = [0,3,30,56]
create_bins_dummies_int(X_prep,col,bins)
mths_since_last_delinq:0_3 mths_since_last_delinq:4_30 mths_since_last_delinq:31_56 mths_since_last_delinq:>=57
cols = [i for i in X_prep.columns if 'delinq' in i]
X_prep[cols].head(2)
| delinq_2yrs | mths_since_last_delinq | acc_now_delinq | delinq_2yrs:0 | delinq_2yrs:1-3 | delinq_2yrs:>3 | acc_now_delinq:0 | acc_now_delinq:>=1 | mths_since_last_delinq:missing | mths_since_last_delinq:0_3 | mths_since_last_delinq:4_30 | mths_since_last_delinq:31_56 | mths_since_last_delinq:>=57 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 193817 | 0.0 | NaN | 0.0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
| 174039 | 1.0 | 22.0 | 0.0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
col = 'dti'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique = 3868 nulls = 0
193817 13.370000 174039 19.559999 Name: dti, dtype: float32
X_prep[col].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x1a071db00>
# we see very few observations after 35, so group those into one bin.
# create a temp df restricted to dti below 35 and bin it into 50 parts.
X_prep_tmp = X_prep[X_prep[col] < 35].copy()  # .copy() avoids SettingWithCopyWarning
y_prep_tmp = y_prep[X_prep_tmp.index]
X_prep_tmp[colf] = pd.cut(X_prep_tmp[col], 50)
df_woe = get_woe_iv(X_prep_tmp,y_prep_tmp,colf,sort=colf)
df_woe.head()
| dti_factor | n_obs | woe | diff_n_obs | text_plotly | |
|---|---|---|---|---|---|
| 0 | (-0.035, 0.7] | 355 | -0.141722 | 81.0 | n_obs: 355<br>WoE: -0.1417<br> |
| 1 | (0.7, 1.4] | 436 | -0.007521 | 75.0 | n_obs: 436<br>WoE: -0.0075<br> |
| 2 | (1.4, 2.099] | 511 | 0.038375 | 134.0 | n_obs: 511<br>WoE: 0.0384<br> |
| 3 | (2.099, 2.799] | 645 | 0.517080 | 138.0 | n_obs: 645<br>WoE: 0.5171<br> |
| 4 | (2.799, 3.499] | 783 | 0.208073 | 209.0 | n_obs: 783<br>WoE: 0.2081<br> |
plot_woe(df_woe,xrot=90,color='k')
bins = [0,0.7,1.4,2.1,3.5,4.9,5.6,6.3,7,9,9.8,10.5,
11.2,11.9,12.6,13.3,14,14.7,17.5,18.2,21,21.7,
22.4,23.1,25.2,25.9,29.4,30.1,30.8,31.5,32.2,32.9,
33.6]
create_bins_dummies_cont(X_prep,col,bins)
dti:0_0.7 dti:0.7_1.4 dti:1.4_2.1 dti:2.1_3.5 dti:3.5_4.9 dti:4.9_5.6 dti:5.6_6.3 dti:6.3_7 dti:7_9 dti:9_9.8 dti:9.8_10.5 dti:10.5_11.2 dti:11.2_11.9 dti:11.9_12.6 dti:12.6_13.3 dti:13.3_14 dti:14_14.7 dti:14.7_17.5 dti:17.5_18.2 dti:18.2_21 dti:21_21.7 dti:21.7_22.4 dti:22.4_23.1 dti:23.1_25.2 dti:25.2_25.9 dti:25.9_29.4 dti:29.4_30.1 dti:30.1_30.8 dti:30.8_31.5 dti:31.5_32.2 dti:32.2_32.9 dti:32.9_33.6 dti:>=33.6
col = 'mths_since_last_record'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
unique = 123 nulls = 80742
193817 NaN 174039 NaN Name: mths_since_last_record, dtype: float32
X_prep[col].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x18bf28320>
# create a missing-value indicator column
vals = np.where((X_prep[col].isnull()), 1, 0)
X_prep[col+':missing'] = vals
print(col+':missing')
mths_since_last_record:missing
"""
mths_since_last_record:missing
mths_since_last_record:0_2
mths_since_last_record:3_22
mths_since_last_record:23_46
mths_since_last_record:47_68
mths_since_last_record:69_85
mths_since_last_record:>=86
""";
X_prep_tmp = X_prep[pd.notnull(X_prep[col])].copy()  # .copy() avoids SettingWithCopyWarning
y_prep_tmp = y_prep[X_prep_tmp.index]
X_prep_tmp[colf] = pd.cut(X_prep_tmp[col], 50)
df_woe = get_woe_iv(X_prep_tmp,y_prep_tmp,colf,sort=colf)
df_woe.head()
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/series.py:679: RuntimeWarning: divide by zero encountered in log
| mths_since_last_record_factor | n_obs | woe | diff_n_obs | text_plotly | |
|---|---|---|---|---|---|
| 0 | (-0.129, 2.58] | 287 | -1.175688 | 257.0 | n_obs: 287<br>WoE: -1.1757<br> |
| 1 | (2.58, 5.16] | 30 | -1.132177 | 2.0 | n_obs: 30<br>WoE: -1.1322<br> |
| 2 | (5.16, 7.74] | 32 | 1.290209 | 24.0 | n_obs: 32<br>WoE: 1.2902<br> |
| 3 | (7.74, 10.32] | 56 | 0.727902 | 13.0 | n_obs: 56<br>WoE: 0.7279<br> |
| 4 | (10.32, 12.9] | 43 | 0.133490 | 32.0 | n_obs: 43<br>WoE: 0.1335<br> |
plot_woe(df_woe,xrot=90,color='k')
# look for the large drops in the WoE curve and create the groups there
bins = [0,2,22,46,68,85]
create_bins_dummies_int(X_prep,col,bins)
mths_since_last_record:0_2 mths_since_last_record:3_22 mths_since_last_record:23_46 mths_since_last_record:47_68 mths_since_last_record:69_85 mths_since_last_record:>=86
print(X_prep.shape)
X_prep.head(2)
(93257, 351)
| Unnamed: 0 | id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | emp_title | emp_length | home_ownership | annual_inc | verification_status | issue_d | loan_status | pymnt_plan | url | desc | purpose | title | zip_code | addr_state | dti | delinq_2yrs | earliest_cr_line | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | out_prncp | out_prncp_inv | total_pymnt | total_pymnt_inv | total_rec_prncp | total_rec_int | total_rec_late_fee | recoveries | collection_recovery_fee | last_pymnt_d | last_pymnt_amnt | next_pymnt_d | last_credit_pull_d | collections_12_mths_ex_med | mths_since_last_major_derog | policy_code | application_type | acc_now_delinq | tot_coll_amt | tot_cur_bal | total_rev_hi_lim | issue_d_date | earliest_cr_line_date | last_pymnt_d_date | last_credit_pull_d_date | mths_since_issue_d | mths_since_earliest_cr_line | mths_since_last_pymnt_d | mths_since_last_credit_pull_d | term_int | emp_length_int | loan_income_ratio | grade:A | grade:B | grade:C | grade:D | grade:E | grade:F | grade:G | sub_grade:A1 | sub_grade:A2 | sub_grade:A3 | sub_grade:A4 | sub_grade:A5 | sub_grade:B1 | sub_grade:B2 | sub_grade:B3 | sub_grade:B4 | sub_grade:B5 | sub_grade:C1 | sub_grade:C2 | sub_grade:C3 | sub_grade:C4 | sub_grade:C5 | sub_grade:D1 | sub_grade:D2 | sub_grade:D3 | sub_grade:D4 | sub_grade:D5 | sub_grade:E1 | sub_grade:E2 | sub_grade:E3 | sub_grade:E4 | sub_grade:E5 | sub_grade:F1 | sub_grade:F2 | sub_grade:F3 | sub_grade:F4 | sub_grade:F5 | sub_grade:G1 | sub_grade:G2 | sub_grade:G3 | sub_grade:G4 | sub_grade:G5 | home_ownership:ANY | home_ownership:MORTGAGE | home_ownership:NONE | home_ownership:OTHER | home_ownership:OWN | home_ownership:RENT | verification_status:Not Verified | verification_status:Source Verified | verification_status:Verified | loan_status:Charged Off | loan_status:Current | loan_status:Default | loan_status:Does not meet the credit policy. Status:Charged Off | loan_status:Does not meet the credit policy. 
Status:Fully Paid | loan_status:Fully Paid | loan_status:In Grace Period | loan_status:Late (16-30 days) | loan_status:Late (31-120 days) | purpose:car | purpose:credit_card | purpose:debt_consolidation | purpose:educational | purpose:home_improvement | purpose:house | purpose:major_purchase | purpose:medical | purpose:moving | purpose:other | purpose:renewable_energy | purpose:small_business | purpose:vacation | purpose:wedding | addr_state:AK | addr_state:AL | addr_state:AR | addr_state:AZ | addr_state:CA | addr_state:CO | addr_state:CT | addr_state:DC | addr_state:DE | addr_state:FL | addr_state:GA | addr_state:HI | addr_state:IA | addr_state:ID | addr_state:IL | addr_state:IN | addr_state:KS | addr_state:KY | addr_state:LA | addr_state:MA | addr_state:MD | addr_state:ME | addr_state:MI | addr_state:MN | addr_state:MO | addr_state:MS | addr_state:MT | addr_state:NC | addr_state:NE | addr_state:NH | addr_state:NJ | addr_state:NM | addr_state:NV | addr_state:NY | addr_state:OH | addr_state:OK | addr_state:OR | addr_state:PA | addr_state:RI | addr_state:SC | addr_state:SD | addr_state:TN | addr_state:TX | addr_state:UT | addr_state:VA | addr_state:VT | addr_state:WA | addr_state:WI | addr_state:WV | addr_state:WY | initial_list_status:f | initial_list_status:w | home_ownership:RENT_OTHER_NONE_ANY | addr_state:ND | addr_state:ND_NE_IA_NV_FL_HI_AL | addr_state:NM_VA | addr_state:OK_TN_MO_LA_MD_NC | addr_state:UT_KY_AZ_NJ | addr_state:AR_MI_PA_OH_MN | addr_state:RI_MA_DE_SD_IN | addr_state:GA_WA_OR | addr_state:WI_MT | addr_state:IL_CT | addr_state:KS_SC_CO_VT_AK_MS | addr_state:WV_NH_WY_DC_ME_ID | purpose:small_business__educational__moving__renewable_energy | purpose:house__medical__wedding__vacation | purpose:home_improvement__major_purchase__car | term:36 | term:60 | emp_length:0 | emp_length:1 | emp_length:2_3 | emp_length:4_5 | emp_length:6_7 | emp_length:8_9 | emp_length:10 | mths_since_issue_d_factor | mths_since_issue_d:0_66 | mths_since_issue_d:67_70 | mths_since_issue_d:71_75 | mths_since_issue_d:76_82 | mths_since_issue_d:83_93 | mths_since_issue_d:94_128 | mths_since_issue_d:>=129 | int_rate_factor | int_rate:0_9.961 | int_rate:9.961_12.025 | int_rate:12.025_15.74 | int_rate:15.74_20.281 | int_rate:>=20.281 | funded_amnt_factor | mths_since_earliest_cr_line_factor | mths_since_earliest_cr_line:0_207 | mths_since_earliest_cr_line:208_350 | mths_since_earliest_cr_line:>=351 | delinq_2yrs:0 | delinq_2yrs:1-3 | delinq_2yrs:>=3 | inq_last_6mths:0 | inq_last_6mths:1 | inq_last_6mths:2_3 | inq_last_6mths:>3 | open_acc:<=3 | open_acc:4_5 | open_acc:6_13 | open_acc:13_17 | open_acc:>17 | pub_rec:0 | pub_rec:1 | pub_rec:2 | pub_rec:3 | pub_rec:>3 | total_acc_factor | total_acc:0_6.24 | total_acc:6.24_21.84 | total_acc:21.84_37.44 | total_acc:37.44_40.56 | total_acc:>=40.56 | acc_now_delinq:0 | acc_now_delinq>=1 | total_rev_hi_lim_factor | total_rev_hi_lim:<=5k | total_rev_hi_lim:5k_10k | total_rev_hi_lim:10k_20k | total_rev_hi_lim:20k_30k | total_rev_hi_lim:30k_40k | total_rev_hi_lim:40k_50k | total_rev_hi_lim:50k_65k | total_rev_hi_lim:65k_80k | total_rev_hi_lim:>80k | installment_factor | installment:0_127 | installment:127_210 | installment:210_266 | installment:266_294 | installment:294_322 | installment:322_378 | installment:378_406 | installment:406_433 | installment:433_517 | installment:517_545 | installment:545_629 | installment:629_712 | installment:712_796 | installment:796_852 | installment:852_880 | installment:880_935 | installment:935_963 | installment:>=963 | 
annual_inc_factor | annual_inc:<20k | annual_inc:20k_30k | annual_inc:30k_40k | annual_inc:40k_50k | annual_inc:50k_60k | annual_inc:60k_70k | annual_inc:70k_80k | annual_inc:80k_90k | annual_inc:90k_100k | annual_inc:100k_120k | annual_inc:120k_140k | annual_inc:>140k | mths_since_last_delinq:missing | mths_since_last_delinq:0_3 | mths_since_last_delinq:4_30 | mths_since_last_delinq:31_56 | mths_since_last_delinq:>=57 | dti:0_0.7 | dti:0.7_1.4 | dti:1.4_2.1 | dti:2.1_3.5 | dti:3.5_4.9 | dti:4.9_5.6 | dti:5.6_6.3 | dti:6.3_7 | dti:7_9 | dti:9_9.8 | dti:9.8_10.5 | dti:10.5_11.2 | dti:11.2_11.9 | dti:11.9_12.6 | dti:12.6_13.3 | dti:13.3_14 | dti:14_14.7 | dti:14.7_17.5 | dti:17.5_18.2 | dti:18.2_21 | dti:21_21.7 | dti:21.7_22.4 | dti:22.4_23.1 | dti:23.1_25.2 | dti:25.2_25.9 | dti:25.9_29.4 | dti:29.4_30.1 | dti:30.1_30.8 | dti:30.8_31.5 | dti:31.5_32.2 | dti:32.2_32.9 | dti:32.9_33.6 | dti:>=33.6 | mths_since_last_record:missing | mths_since_last_record:0_2 | mths_since_last_record:3_22 | mths_since_last_record:23_46 | mths_since_last_record:47_68 | mths_since_last_record:69_85 | mths_since_last_record:>=86 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 193817 | 193817 | 1581078 | 1850197 | 5500 | 5500 | 5500.0 | 36 months | 10.16 | 177.889999 | B | B1 | UPS | 5 years | RENT | 18572.849609 | Verified | Oct-12 | Fully Paid | n | https://www.lendingclub.com/browse/loanDetail.... | NaN | debt_consolidation | Debt consolidation | 906xx | CA | 13.370000 | 0.0 | Apr-07 | 1.0 | NaN | NaN | 5.0 | 0.0 | 4892 | 51.0 | 6.0 | f | 0.0 | 0.0 | 6402.621094 | 6402.620117 | 5500.0 | 902.619995 | 0.0 | 0.0 | 0.0 | Oct-15 | 181.750000 | NaN | Jan-16 | 0.0 | NaN | 1 | INDIVIDUAL | 0.0 | 0.0 | 6632.0 | 9600.0 | 2012-10-01 | 2007-04-01 | 2015-10-01 | 2016-01-01 | 90.0 | 156.0 | 54.0 | 51.0 | 36 | 5.0 | 0.296131 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | (89.2, 91.0] | 0 | 0 | 0 | 0 | 1 | 0 | 0 | (9.961, 10.374] | 0 | 1 | 0 | 0 | 0 | (5330.0, 6020.0] | (135.4, 159.38] | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | (4.68, 7.02] | 1 | 0 | 0 | 0 | 0 | 1 | 0 | (-1090.7, 10907.0] | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (155.132, 182.976] | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (-5444.395, 76443.95] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 174039 | 174039 | 3055729 | 3728105 | 6000 | 6000 | 6000.0 | 36 months | 15.80 | 210.360001 | C | C3 | peshastin market | < 1 year | OWN | 20000.000000 | Not Verified | Jan-13 | Fully Paid | n | https://www.lendingclub.com/browse/loanDetail.... | NaN | debt_consolidation | credit card pay off | 988xx | WA | 19.559999 | 1.0 | Nov-95 | 0.0 | 22.0 | NaN | 7.0 | 0.0 | 5291 | 72.5 | 16.0 | f | 0.0 | 0.0 | 7468.176270 | 7468.180176 | 6000.0 | 1468.180054 | 0.0 | 0.0 | 0.0 | Apr-15 | 2004.670044 | NaN | May-15 | 0.0 | 29.0 | 1 | INDIVIDUAL | 0.0 | 0.0 | 11685.0 | 7300.0 | 2013-01-01 | 1995-11-01 | 2015-04-01 | 2015-05-01 | 87.0 | 293.0 | 60.0 | 59.0 | 36 | 1.0 | 0.300000 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | (85.6, 87.4] | 0 | 0 | 0 | 0 | 1 | 0 | 0 | (15.74, 16.153] | 0 | 0 | 0 | 1 | 0 | (5330.0, 6020.0] | (279.28, 303.26] | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | (14.04, 16.38] | 0 | 1 | 0 | 0 | 0 | 1 | 0 | (-1090.7, 10907.0] | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (182.976, 210.821] | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (-5444.395, 76443.95] | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
# 1. after saving, comment out these two lines.
# 2. go back up in the notebook to where X_prep is defined,
#    switch it to the test set, and re-run all the cells below that point.
X_prep.to_csv(dat_pro + 'Xtrain.csv')
y_prep.to_csv(dat_pro + 'ytrain.csv')
# after running the test pass, comment these out again and uncomment the train part.
# we always derive the bins from train, never from test.
# X_prep.to_csv(dat_pro + 'Xtest.csv')
# y_prep.to_csv(dat_pro + 'ytest.csv')
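# When these files are read back later in the pipeline, something like
# the following restores them with the index aligned (a sketch; with
# pandas 1.0, to_csv writes the index as the first column, so
# index_col=0 recovers it):
import pandas as pd

dat_pro = '../data/processed/'
X_train = pd.read_csv(dat_pro + 'Xtrain.csv', index_col=0, low_memory=False)
# y was saved as a one-column frame; squeeze it back to a Series
y_train = pd.read_csv(dat_pro + 'ytrain.csv', index_col=0).squeeze()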